kreuzberg 3.18.0__tar.gz → 3.20.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (362) hide show
  1. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/PKG-INFO +32 -45
  2. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_api/main.py +4 -2
  3. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_entity_extraction.py +4 -8
  4. kreuzberg-3.20.1/kreuzberg/_error_handling.py +182 -0
  5. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_base.py +2 -2
  6. kreuzberg-3.20.1/kreuzberg/_extractors/_html.py +138 -0
  7. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_pdf.py +33 -54
  8. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_structured.py +1 -1
  9. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_language_detection.py +2 -0
  10. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/_tesseract.py +76 -297
  11. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_types.py +143 -47
  12. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/cli.py +36 -22
  13. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/extraction.py +251 -107
  14. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/pyproject.toml +58 -74
  15. kreuzberg-3.18.0/.commitlintrc +0 -1
  16. kreuzberg-3.18.0/.deepsource.toml +0 -54
  17. kreuzberg-3.18.0/.docker/Dockerfile +0 -79
  18. kreuzberg-3.18.0/.docker/README.md +0 -190
  19. kreuzberg-3.18.0/.dockerignore +0 -15
  20. kreuzberg-3.18.0/.github/dependabot.yaml +0 -6
  21. kreuzberg-3.18.0/.github/workflows/ci.yaml +0 -381
  22. kreuzberg-3.18.0/.github/workflows/docker-e2e-tests.yml +0 -150
  23. kreuzberg-3.18.0/.github/workflows/docs.yml +0 -66
  24. kreuzberg-3.18.0/.github/workflows/pr-title.yaml +0 -20
  25. kreuzberg-3.18.0/.github/workflows/publish-docker.yml +0 -163
  26. kreuzberg-3.18.0/.github/workflows/release.yaml +0 -37
  27. kreuzberg-3.18.0/.github/workflows/test-docker-builds.yml +0 -101
  28. kreuzberg-3.18.0/.gitignore +0 -74
  29. kreuzberg-3.18.0/.markdownlint.yaml +0 -17
  30. kreuzberg-3.18.0/.pre-commit-config.yaml +0 -82
  31. kreuzberg-3.18.0/.prettierignore +0 -1
  32. kreuzberg-3.18.0/ATTRIBUTIONS.md +0 -47
  33. kreuzberg-3.18.0/LICENSE +0 -7
  34. kreuzberg-3.18.0/Taskfile.yml +0 -50
  35. kreuzberg-3.18.0/ai-rulez.yaml +0 -586
  36. kreuzberg-3.18.0/benchmarks/README.md +0 -264
  37. kreuzberg-3.18.0/benchmarks/batch_size_benchmark.py +0 -179
  38. kreuzberg-3.18.0/benchmarks/batch_validation_benchmark.py +0 -83
  39. kreuzberg-3.18.0/benchmarks/pyproject.toml +0 -29
  40. kreuzberg-3.18.0/benchmarks/src/__init__.py +0 -1
  41. kreuzberg-3.18.0/benchmarks/src/__main__.py +0 -4
  42. kreuzberg-3.18.0/benchmarks/src/benchmarks.py +0 -703
  43. kreuzberg-3.18.0/benchmarks/src/cli.py +0 -723
  44. kreuzberg-3.18.0/benchmarks/src/models.py +0 -195
  45. kreuzberg-3.18.0/benchmarks/src/profiler.py +0 -161
  46. kreuzberg-3.18.0/benchmarks/src/runner.py +0 -367
  47. kreuzberg-3.18.0/benchmarks/token_reduction_compression_benchmark.py +0 -268
  48. kreuzberg-3.18.0/docs/advanced/custom-extractors.md +0 -203
  49. kreuzberg-3.18.0/docs/advanced/custom-hooks.md +0 -148
  50. kreuzberg-3.18.0/docs/advanced/error-handling.md +0 -181
  51. kreuzberg-3.18.0/docs/advanced/index.md +0 -41
  52. kreuzberg-3.18.0/docs/advanced/performance.md +0 -306
  53. kreuzberg-3.18.0/docs/api-reference/exceptions.md +0 -33
  54. kreuzberg-3.18.0/docs/api-reference/extraction-functions.md +0 -59
  55. kreuzberg-3.18.0/docs/api-reference/extractor-registry.md +0 -5
  56. kreuzberg-3.18.0/docs/api-reference/index.md +0 -51
  57. kreuzberg-3.18.0/docs/api-reference/ocr-configuration.md +0 -27
  58. kreuzberg-3.18.0/docs/api-reference/types.md +0 -120
  59. kreuzberg-3.18.0/docs/assets/favicon.png +0 -0
  60. kreuzberg-3.18.0/docs/assets/logo.png +0 -0
  61. kreuzberg-3.18.0/docs/cli.md +0 -225
  62. kreuzberg-3.18.0/docs/contributing.md +0 -82
  63. kreuzberg-3.18.0/docs/css/extra.css +0 -56
  64. kreuzberg-3.18.0/docs/examples/extraction-examples.md +0 -763
  65. kreuzberg-3.18.0/docs/examples/index.md +0 -48
  66. kreuzberg-3.18.0/docs/getting-started/index.md +0 -20
  67. kreuzberg-3.18.0/docs/getting-started/installation.md +0 -154
  68. kreuzberg-3.18.0/docs/getting-started/quick-start.md +0 -111
  69. kreuzberg-3.18.0/docs/index.md +0 -60
  70. kreuzberg-3.18.0/docs/user-guide/api-server.md +0 -531
  71. kreuzberg-3.18.0/docs/user-guide/basic-usage.md +0 -161
  72. kreuzberg-3.18.0/docs/user-guide/chunking.md +0 -124
  73. kreuzberg-3.18.0/docs/user-guide/docker.md +0 -548
  74. kreuzberg-3.18.0/docs/user-guide/document-classification.md +0 -61
  75. kreuzberg-3.18.0/docs/user-guide/extraction-configuration.md +0 -966
  76. kreuzberg-3.18.0/docs/user-guide/index.md +0 -45
  77. kreuzberg-3.18.0/docs/user-guide/mcp-server.md +0 -586
  78. kreuzberg-3.18.0/docs/user-guide/metadata-extraction.md +0 -125
  79. kreuzberg-3.18.0/docs/user-guide/ocr-backends.md +0 -247
  80. kreuzberg-3.18.0/docs/user-guide/ocr-configuration.md +0 -414
  81. kreuzberg-3.18.0/docs/user-guide/supported-formats.md +0 -71
  82. kreuzberg-3.18.0/docs/user-guide/token-reduction.md +0 -251
  83. kreuzberg-3.18.0/kreuzberg/_extractors/_html.py +0 -148
  84. kreuzberg-3.18.0/kreuzberg/_utils/__init__.py +0 -0
  85. kreuzberg-3.18.0/kreuzberg/_utils/_html_streaming.py +0 -20
  86. kreuzberg-3.18.0/kreuzberg/py.typed +0 -0
  87. kreuzberg-3.18.0/mkdocs.yaml +0 -160
  88. kreuzberg-3.18.0/tests/__init__.py +0 -0
  89. kreuzberg-3.18.0/tests/api/__init__.py +0 -0
  90. kreuzberg-3.18.0/tests/api/config_cache_test.py +0 -224
  91. kreuzberg-3.18.0/tests/api/conftest.py +0 -18
  92. kreuzberg-3.18.0/tests/api/environment_config_test.py +0 -154
  93. kreuzberg-3.18.0/tests/api/header_config_hashing_test.py +0 -29
  94. kreuzberg-3.18.0/tests/api/image_extraction_test.py +0 -59
  95. kreuzberg-3.18.0/tests/api/main_test.py +0 -817
  96. kreuzberg-3.18.0/tests/api/runtime_config_test.py +0 -374
  97. kreuzberg-3.18.0/tests/conftest.py +0 -219
  98. kreuzberg-3.18.0/tests/core/__init__.py +0 -0
  99. kreuzberg-3.18.0/tests/core/comprehensive_config_test.py +0 -664
  100. kreuzberg-3.18.0/tests/core/config_test.py +0 -15
  101. kreuzberg-3.18.0/tests/core/constants_test.py +0 -22
  102. kreuzberg-3.18.0/tests/core/dpi_configuration_test.py +0 -319
  103. kreuzberg-3.18.0/tests/core/exceptions_test.py +0 -159
  104. kreuzberg-3.18.0/tests/core/extraction_batch_test.py +0 -389
  105. kreuzberg-3.18.0/tests/core/extraction_test.py +0 -494
  106. kreuzberg-3.18.0/tests/core/html_to_markdown_config_test.py +0 -0
  107. kreuzberg-3.18.0/tests/core/image_ocr_result_test.py +0 -27
  108. kreuzberg-3.18.0/tests/core/init_test.py +0 -85
  109. kreuzberg-3.18.0/tests/core/main_test.py +0 -35
  110. kreuzberg-3.18.0/tests/core/mime_types_test.py +0 -242
  111. kreuzberg-3.18.0/tests/core/registry_test.py +0 -225
  112. kreuzberg-3.18.0/tests/core/types_test.py +0 -465
  113. kreuzberg-3.18.0/tests/e2e/__init__.py +0 -0
  114. kreuzberg-3.18.0/tests/e2e/docker_e2e.py +0 -481
  115. kreuzberg-3.18.0/tests/extractors/README_image_tests.md +0 -85
  116. kreuzberg-3.18.0/tests/extractors/__init__.py +0 -0
  117. kreuzberg-3.18.0/tests/extractors/base_extractor_test.py +0 -420
  118. kreuzberg-3.18.0/tests/extractors/base_memory_limits_test.py +0 -100
  119. kreuzberg-3.18.0/tests/extractors/base_ocr_processing_test.py +0 -276
  120. kreuzberg-3.18.0/tests/extractors/base_ocr_simple_test.py +0 -64
  121. kreuzberg-3.18.0/tests/extractors/email_error_paths_test.py +0 -39
  122. kreuzberg-3.18.0/tests/extractors/email_test.py +0 -948
  123. kreuzberg-3.18.0/tests/extractors/html_invalid_base64_test.py +0 -11
  124. kreuzberg-3.18.0/tests/extractors/html_test.py +0 -52
  125. kreuzberg-3.18.0/tests/extractors/image_deduplication_test.py +0 -87
  126. kreuzberg-3.18.0/tests/extractors/image_error_handling_test.py +0 -253
  127. kreuzberg-3.18.0/tests/extractors/image_error_simple_test.py +0 -75
  128. kreuzberg-3.18.0/tests/extractors/image_test.py +0 -766
  129. kreuzberg-3.18.0/tests/extractors/json_test.py +0 -427
  130. kreuzberg-3.18.0/tests/extractors/pandoc_metadata_test.py +0 -323
  131. kreuzberg-3.18.0/tests/extractors/pandoc_test.py +0 -1995
  132. kreuzberg-3.18.0/tests/extractors/pdf_images_test.py +0 -52
  133. kreuzberg-3.18.0/tests/extractors/pdf_sync_images_test.py +0 -217
  134. kreuzberg-3.18.0/tests/extractors/pdf_test.py +0 -979
  135. kreuzberg-3.18.0/tests/extractors/presentation_test.py +0 -967
  136. kreuzberg-3.18.0/tests/extractors/spreadsheet_test.py +0 -1140
  137. kreuzberg-3.18.0/tests/extractors/structured_test.py +0 -304
  138. kreuzberg-3.18.0/tests/features/__init__.py +0 -0
  139. kreuzberg-3.18.0/tests/features/chunker_test.py +0 -94
  140. kreuzberg-3.18.0/tests/features/document_classification_test.py +0 -747
  141. kreuzberg-3.18.0/tests/features/entity_extraction_test.py +0 -279
  142. kreuzberg-3.18.0/tests/features/gmft_test.py +0 -1496
  143. kreuzberg-3.18.0/tests/features/hooks_test.py +0 -0
  144. kreuzberg-3.18.0/tests/features/language_detection_test.py +0 -343
  145. kreuzberg-3.18.0/tests/features/table_extraction_test.py +0 -0
  146. kreuzberg-3.18.0/tests/features/token_reduction_test.py +0 -813
  147. kreuzberg-3.18.0/tests/integration/__init__.py +0 -0
  148. kreuzberg-3.18.0/tests/integration/all_extractors_images_test.py +0 -252
  149. kreuzberg-3.18.0/tests/integration/api/__init__.py +0 -0
  150. kreuzberg-3.18.0/tests/integration/api/large_file_test.py +0 -0
  151. kreuzberg-3.18.0/tests/integration/api/mounted_config_test.py +0 -0
  152. kreuzberg-3.18.0/tests/integration/dpi_integration_test.py +0 -209
  153. kreuzberg-3.18.0/tests/integration/multiprocessing/__init__.py +0 -0
  154. kreuzberg-3.18.0/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
  155. kreuzberg-3.18.0/tests/integration/ocr/__init__.py +0 -0
  156. kreuzberg-3.18.0/tests/integration/ocr/device_integration_test.py +0 -0
  157. kreuzberg-3.18.0/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
  158. kreuzberg-3.18.0/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
  159. kreuzberg-3.18.0/tests/integration/pandoc_images_test.py +0 -30
  160. kreuzberg-3.18.0/tests/integration/pdf_images_test.py +0 -18
  161. kreuzberg-3.18.0/tests/integration/pdf_real_images_test.py +0 -52
  162. kreuzberg-3.18.0/tests/integration/pptx_complex_test.py +0 -22
  163. kreuzberg-3.18.0/tests/integration/pptx_images_test.py +0 -18
  164. kreuzberg-3.18.0/tests/integration/regression_test.py +0 -134
  165. kreuzberg-3.18.0/tests/integration/token_reduction_integration_test.py +0 -173
  166. kreuzberg-3.18.0/tests/interfaces/__init__.py +0 -0
  167. kreuzberg-3.18.0/tests/interfaces/cli_test.py +0 -527
  168. kreuzberg-3.18.0/tests/interfaces/mcp_server_test.py +0 -1116
  169. kreuzberg-3.18.0/tests/mcp/__init__.py +0 -0
  170. kreuzberg-3.18.0/tests/mcp/mcp_server_test.py +0 -0
  171. kreuzberg-3.18.0/tests/multiprocessing/__init__.py +0 -0
  172. kreuzberg-3.18.0/tests/multiprocessing/gmft_isolated_test.py +0 -449
  173. kreuzberg-3.18.0/tests/multiprocessing/process_manager_test.py +0 -273
  174. kreuzberg-3.18.0/tests/multiprocessing/tesseract_pool_test.py +0 -331
  175. kreuzberg-3.18.0/tests/ocr/__init__.py +0 -0
  176. kreuzberg-3.18.0/tests/ocr/base_test.py +0 -80
  177. kreuzberg-3.18.0/tests/ocr/easyocr_test.py +0 -517
  178. kreuzberg-3.18.0/tests/ocr/init_test.py +0 -35
  179. kreuzberg-3.18.0/tests/ocr/paddleocr_test.py +0 -835
  180. kreuzberg-3.18.0/tests/ocr/tesseract_test.py +0 -1314
  181. kreuzberg-3.18.0/tests/ocr/tesseract_tsv_test.py +0 -409
  182. kreuzberg-3.18.0/tests/performance/__init__.py +0 -0
  183. kreuzberg-3.18.0/tests/performance/large_pdf_perf_test.py +0 -29
  184. kreuzberg-3.18.0/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
  185. kreuzberg-3.18.0/tests/test_source_files/contract.txt +0 -1
  186. kreuzberg-3.18.0/tests/test_source_files/contract_test.txt +0 -4
  187. kreuzberg-3.18.0/tests/test_source_files/document.docx +0 -0
  188. kreuzberg-3.18.0/tests/test_source_files/email/sample-email.eml +0 -11
  189. kreuzberg-3.18.0/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  190. kreuzberg-3.18.0/tests/test_source_files/excel.xlsx +0 -0
  191. kreuzberg-3.18.0/tests/test_source_files/flower-no-text.jpg +0 -0
  192. kreuzberg-3.18.0/tests/test_source_files/form_test.txt +0 -5
  193. kreuzberg-3.18.0/tests/test_source_files/french-text.txt +0 -2
  194. kreuzberg-3.18.0/tests/test_source_files/german-text.txt +0 -2
  195. kreuzberg-3.18.0/tests/test_source_files/google-doc-document.pdf +0 -0
  196. kreuzberg-3.18.0/tests/test_source_files/html.html +0 -10
  197. kreuzberg-3.18.0/tests/test_source_files/image-only-german-pdf.pdf +0 -0
  198. kreuzberg-3.18.0/tests/test_source_files/images/test_hello_world.png +0 -0
  199. kreuzberg-3.18.0/tests/test_source_files/invoice_image.png +0 -0
  200. kreuzberg-3.18.0/tests/test_source_files/invoice_test.txt +0 -4
  201. kreuzberg-3.18.0/tests/test_source_files/json/complex_nested.json +0 -41
  202. kreuzberg-3.18.0/tests/test_source_files/json/real_world/aws_policy.json +0 -43
  203. kreuzberg-3.18.0/tests/test_source_files/json/real_world/earthquakes.geojson +0 -6
  204. kreuzberg-3.18.0/tests/test_source_files/json/real_world/github_emojis.json +0 -111
  205. kreuzberg-3.18.0/tests/test_source_files/json/real_world/iss_location.json +0 -1
  206. kreuzberg-3.18.0/tests/test_source_files/json/real_world/openapi_spec.json +0 -84
  207. kreuzberg-3.18.0/tests/test_source_files/json/real_world/package.json +0 -33
  208. kreuzberg-3.18.0/tests/test_source_files/json/real_world/rick_morty_character.json +0 -1
  209. kreuzberg-3.18.0/tests/test_source_files/json/sample-document.json +0 -1
  210. kreuzberg-3.18.0/tests/test_source_files/json/schema_test.json +0 -25
  211. kreuzberg-3.18.0/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  212. kreuzberg-3.18.0/tests/test_source_files/markdown.md +0 -1
  213. kreuzberg-3.18.0/tests/test_source_files/non-ascii-text.pdf +0 -0
  214. kreuzberg-3.18.0/tests/test_source_files/non-searchable.pdf +0 -0
  215. kreuzberg-3.18.0/tests/test_source_files/ocr-image.jpg +0 -0
  216. kreuzberg-3.18.0/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  217. kreuzberg-3.18.0/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  218. kreuzberg-3.18.0/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  219. kreuzberg-3.18.0/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  220. kreuzberg-3.18.0/tests/test_source_files/receipt_test.txt +0 -5
  221. kreuzberg-3.18.0/tests/test_source_files/report_test.txt +0 -4
  222. kreuzberg-3.18.0/tests/test_source_files/sample-contract.pdf +0 -0
  223. kreuzberg-3.18.0/tests/test_source_files/scanned.pdf +0 -0
  224. kreuzberg-3.18.0/tests/test_source_files/searchable.pdf +0 -0
  225. kreuzberg-3.18.0/tests/test_source_files/sharable-web-guide.pdf +0 -0
  226. kreuzberg-3.18.0/tests/test_source_files/spanish-text.txt +0 -2
  227. kreuzberg-3.18.0/tests/test_source_files/tables/borderless_table.png +0 -0
  228. kreuzberg-3.18.0/tests/test_source_files/tables/complex_document.png +0 -0
  229. kreuzberg-3.18.0/tests/test_source_files/tables/simple_table.png +0 -0
  230. kreuzberg-3.18.0/tests/test_source_files/test-article.pdf +0 -0
  231. kreuzberg-3.18.0/tests/test_source_files/test-excel.xls +0 -0
  232. kreuzberg-3.18.0/tests/test_source_files/yaml/sample-config.yaml +0 -15
  233. kreuzberg-3.18.0/tests/utils/__init__.py +0 -0
  234. kreuzberg-3.18.0/tests/utils/cache_test.py +0 -427
  235. kreuzberg-3.18.0/tests/utils/device_test.py +0 -347
  236. kreuzberg-3.18.0/tests/utils/errors_test.py +0 -343
  237. kreuzberg-3.18.0/tests/utils/ocr_cache_test.py +0 -286
  238. kreuzberg-3.18.0/tests/utils/pdf_lock_test.py +0 -215
  239. kreuzberg-3.18.0/tests/utils/playa_helpers_test.py +0 -0
  240. kreuzberg-3.18.0/tests/utils/playa_metadata_test.py +0 -753
  241. kreuzberg-3.18.0/tests/utils/playa_test.py +0 -315
  242. kreuzberg-3.18.0/tests/utils/process_pool_test.py +0 -223
  243. kreuzberg-3.18.0/tests/utils/quality_test.py +0 -121
  244. kreuzberg-3.18.0/tests/utils/ref_test.py +0 -90
  245. kreuzberg-3.18.0/tests/utils/serialization_test.py +0 -379
  246. kreuzberg-3.18.0/tests/utils/string_test.py +0 -251
  247. kreuzberg-3.18.0/tests/utils/sync_test.py +0 -259
  248. kreuzberg-3.18.0/tests/utils/table_test.py +0 -353
  249. kreuzberg-3.18.0/tests/utils/tmp_test.py +0 -50
  250. kreuzberg-3.18.0/uv.lock +0 -6208
  251. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/README.md +0 -0
  252. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/__init__.py +0 -0
  253. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/__main__.py +0 -0
  254. {kreuzberg-3.18.0/benchmarks → kreuzberg-3.20.1/kreuzberg/_api}/__init__.py +0 -0
  255. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_api/_config_cache.py +0 -0
  256. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_chunker.py +0 -0
  257. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_config.py +0 -0
  258. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_constants.py +0 -0
  259. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_document_classification.py +0 -0
  260. {kreuzberg-3.18.0/kreuzberg/_api → kreuzberg-3.20.1/kreuzberg/_extractors}/__init__.py +0 -0
  261. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_email.py +0 -0
  262. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_image.py +0 -0
  263. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_pandoc.py +0 -0
  264. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_presentation.py +0 -0
  265. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  266. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_gmft.py +0 -0
  267. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_mcp/__init__.py +0 -0
  268. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_mcp/server.py +0 -0
  269. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_mime_types.py +0 -0
  270. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/__init__.py +0 -0
  271. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/_base.py +0 -0
  272. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/_easyocr.py +0 -0
  273. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/_paddleocr.py +0 -0
  274. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/_table_extractor.py +0 -0
  275. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_playa.py +0 -0
  276. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_registry.py +0 -0
  277. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/__init__.py +0 -0
  278. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/_reducer.py +0 -0
  279. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/_stopwords.py +0 -0
  280. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
  281. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
  282. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
  283. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
  284. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
  285. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
  286. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
  287. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
  288. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
  289. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
  290. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
  291. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
  292. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
  293. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
  294. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
  295. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
  296. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
  297. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
  298. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
  299. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
  300. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
  301. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
  302. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
  303. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
  304. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
  305. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
  306. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
  307. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
  308. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
  309. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
  310. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
  311. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
  312. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
  313. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
  314. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
  315. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
  316. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
  317. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
  318. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
  319. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
  320. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
  321. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
  322. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
  323. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
  324. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
  325. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
  326. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
  327. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
  328. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
  329. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
  330. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
  331. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
  332. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
  333. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
  334. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
  335. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
  336. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
  337. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
  338. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
  339. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
  340. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
  341. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
  342. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
  343. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
  344. {kreuzberg-3.18.0/kreuzberg/_extractors → kreuzberg-3.20.1/kreuzberg/_utils}/__init__.py +0 -0
  345. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_cache.py +0 -0
  346. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_device.py +0 -0
  347. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_document_cache.py +0 -0
  348. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_errors.py +0 -0
  349. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_image_preprocessing.py +0 -0
  350. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_ocr_cache.py +0 -0
  351. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
  352. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_process_pool.py +0 -0
  353. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_quality.py +0 -0
  354. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_ref.py +0 -0
  355. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_resource_managers.py +0 -0
  356. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_serialization.py +0 -0
  357. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_string.py +0 -0
  358. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_sync.py +0 -0
  359. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_table.py +0 -0
  360. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_tmp.py +0 -0
  361. {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/exceptions.py +0 -0
  362. {kreuzberg-3.18.0/benchmarks → kreuzberg-3.20.1/kreuzberg}/py.typed +0 -0
@@ -1,13 +1,11 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.3
2
2
  Name: kreuzberg
3
- Version: 3.18.0
3
+ Version: 3.20.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
- Project-URL: documentation, https://kreuzberg.dev
6
- Project-URL: homepage, https://github.com/Goldziher/kreuzberg
5
+ Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
6
+ Author: Na'aman Hirschfeld
7
7
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
8
8
  License: MIT
9
- License-File: LICENSE
10
- Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
11
9
  Classifier: Development Status :: 5 - Production/Stable
12
10
  Classifier: Intended Audience :: Developers
13
11
  Classifier: Intended Audience :: Information Technology
@@ -27,67 +25,56 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
27
25
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
26
  Classifier: Topic :: Text Processing :: General
29
27
  Classifier: Typing :: Typed
30
- Requires-Python: >=3.10
31
28
  Requires-Dist: anyio>=4.11.0
32
29
  Requires-Dist: chardetng-py>=0.3.5
33
- Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.16.0
30
+ Requires-Dist: exceptiongroup>=1.2.2 ; python_full_version < '3.11'
31
+ Requires-Dist: html-to-markdown>=2.1.0
35
32
  Requires-Dist: langcodes>=3.5.0
36
- Requires-Dist: mcp>=1.15.0
33
+ Requires-Dist: mcp>=1.17.0
37
34
  Requires-Dist: msgspec>=0.18.0
38
35
  Requires-Dist: numpy>=2.0.0
39
36
  Requires-Dist: playa-pdf>=0.7.0
40
- Requires-Dist: polars>=1.33.1
37
+ Requires-Dist: polars>=1.34.0
41
38
  Requires-Dist: psutil>=7.1.0
42
39
  Requires-Dist: pypdfium2==4.30.0
43
40
  Requires-Dist: python-calamine>=0.5.3
44
41
  Requires-Dist: python-pptx>=1.0.2
45
- Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
42
+ Requires-Dist: transformers>=4.55.0
43
+ Requires-Dist: typing-extensions>=4.15.0 ; python_full_version < '3.12'
44
+ Requires-Dist: mailparse>=1.0.15 ; extra == 'additional-extensions'
45
+ Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'additional-extensions'
46
+ Requires-Dist: kreuzberg[additional-extensions,api,chunking,cli,crypto,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr] ; extra == 'all'
47
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0 ; extra == 'api'
48
+ Requires-Dist: semantic-text-splitter>=0.28.0 ; extra == 'chunking'
49
+ Requires-Dist: click>=8.3.0 ; extra == 'cli'
50
+ Requires-Dist: rich>=14.2.0 ; extra == 'cli'
51
+ Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'cli'
52
+ Requires-Dist: playa-pdf[crypto]>=0.7.0 ; extra == 'crypto'
53
+ Requires-Dist: deep-translator>=1.11.4 ; extra == 'document-classification'
54
+ Requires-Dist: easyocr>=1.7.2 ; python_full_version < '3.14' and extra == 'easyocr'
55
+ Requires-Dist: keybert>=0.9.0 ; extra == 'entity-extraction'
56
+ Requires-Dist: spacy>=3.8.7 ; python_full_version < '3.14' and extra == 'entity-extraction'
57
+ Requires-Dist: gmft>=0.4.2 ; extra == 'gmft'
58
+ Requires-Dist: transformers>=4.57.0 ; extra == 'gmft'
59
+ Requires-Dist: fast-langdetect>=1.0.0 ; extra == 'langdetect'
60
+ Requires-Dist: paddleocr>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
61
+ Requires-Dist: paddlepaddle>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
62
+ Requires-Dist: setuptools>=80.9.0 ; extra == 'paddleocr'
63
+ Requires-Python: >=3.10
64
+ Project-URL: documentation, https://kreuzberg.dev
65
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
46
66
  Provides-Extra: additional-extensions
47
- Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
48
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
49
67
  Provides-Extra: all
50
- Requires-Dist: click>=8.2.1; extra == 'all'
51
- Requires-Dist: deep-translator>=1.11.4; extra == 'all'
52
- Requires-Dist: easyocr>=1.7.2; extra == 'all'
53
- Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
54
- Requires-Dist: gmft>=0.4.2; extra == 'all'
55
- Requires-Dist: keybert>=0.9.0; extra == 'all'
56
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
57
- Requires-Dist: mailparse>=1.0.15; extra == 'all'
58
- Requires-Dist: paddleocr>=3.2.0; extra == 'all'
59
- Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
60
- Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
61
- Requires-Dist: rich>=14.1.0; extra == 'all'
62
- Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
63
- Requires-Dist: setuptools>=80.9.0; extra == 'all'
64
- Requires-Dist: spacy>=3.8.7; extra == 'all'
65
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
66
68
  Provides-Extra: api
67
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
68
69
  Provides-Extra: chunking
69
- Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
70
70
  Provides-Extra: cli
71
- Requires-Dist: click>=8.2.1; extra == 'cli'
72
- Requires-Dist: rich>=14.1.0; extra == 'cli'
73
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
74
71
  Provides-Extra: crypto
75
- Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
76
72
  Provides-Extra: document-classification
77
- Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
78
73
  Provides-Extra: easyocr
79
- Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
80
74
  Provides-Extra: entity-extraction
81
- Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
82
- Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
83
75
  Provides-Extra: gmft
84
- Requires-Dist: gmft>=0.4.2; extra == 'gmft'
85
76
  Provides-Extra: langdetect
86
- Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
87
77
  Provides-Extra: paddleocr
88
- Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
89
- Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
90
- Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
91
78
  Description-Content-Type: text/markdown
92
79
 
93
80
  # Kreuzberg
@@ -110,10 +110,9 @@ def _get_max_upload_size() -> int:
110
110
  Environment Variables:
111
111
  KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
112
112
  """
113
- default_size = 1024 * 1024 * 1024 # 1GB
113
+ default_size = 1024 * 1024 * 1024
114
114
  try:
115
115
  size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
116
- # Return default if negative
117
116
  return size if size >= 0 else default_size
118
117
  except ValueError:
119
118
  return default_size
@@ -311,6 +310,9 @@ async def handle_files_upload( # noqa: PLR0913
311
310
  """
312
311
  static_config = discover_config_cached()
313
312
 
313
+ if not data:
314
+ raise ValidationError("No files provided for extraction", context={"file_count": 0})
315
+
314
316
  min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
315
317
  max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
316
318
 
@@ -144,10 +144,9 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
144
144
  try:
145
145
  nlp = spacy.load(model_name)
146
146
  except OSError:
147
- # Try to download the model automatically
147
+
148
148
  async def install_model() -> tuple[bool, str | None]:
149
149
  """Install model and return success status and error message."""
150
- # First try spaCy's built-in download
151
150
  try:
152
151
  success = await install_spacy_model_with_spacy(model_name)
153
152
  if success:
@@ -157,7 +156,6 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
157
156
  else:
158
157
  spacy_error = "spaCy download failed"
159
158
 
160
- # If spaCy download failed and uv is available, try uv as fallback
161
159
  if is_uv_available():
162
160
  try:
163
161
  result = await install_spacy_model_with_uv(model_name)
@@ -167,14 +165,12 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
167
165
 
168
166
  return False, spacy_error
169
167
 
170
- # Run the async installation in a sync context
171
168
  try:
172
169
  success, error_details = anyio.run(install_model)
173
- except (OSError, RuntimeError) as e:
174
- success, error_details = False, str(e)
170
+ except SystemExit as e:
171
+ success, error_details = False, f"spaCy CLI exit code: {e.code}"
175
172
 
176
173
  if not success:
177
- # Generate appropriate error message based on available tools
178
174
  if is_uv_available():
179
175
  model_url = get_spacy_model_url(model_name)
180
176
  manual_install_cmd = f"uv pip install {model_url}"
@@ -234,7 +230,7 @@ def extract_keywords(
234
230
  kw_model = KeyBERT()
235
231
  keywords = kw_model.extract_keywords(text, top_n=keyword_count)
236
232
  return [(kw, float(score)) for kw, score in keywords]
237
- except (RuntimeError, OSError, ValueError):
233
+ except ValueError:
238
234
  return []
239
235
  except ImportError as e: # pragma: no cover
240
236
  raise MissingDependencyError.create_for_package(
@@ -0,0 +1,182 @@
1
+ """Type-safe error handling utilities for extraction pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import traceback
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Callable
10
+
11
+ from kreuzberg._types import ErrorContextType, ExtractionResult, Metadata, ProcessingErrorDict
12
+ from kreuzberg.exceptions import KreuzbergError, MissingDependencyError, ValidationError
13
+
14
+
15
+ def should_exception_bubble_up(exception: Exception, context: ErrorContextType = "unknown") -> bool:
16
+ """Determine if an exception should bubble up or be handled gracefully.
17
+
18
+ Args:
19
+ exception: The exception to classify
20
+ context: The context where the exception occurred (e.g., "batch_processing", "single_extraction", "optional_feature")
21
+
22
+ Returns:
23
+ True if the exception should bubble up, False if it should be handled gracefully
24
+ """
25
+ if isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError)):
26
+ return True
27
+
28
+ if isinstance(exception, MissingDependencyError):
29
+ return True
30
+
31
+ if isinstance(exception, ValidationError):
32
+ if context == "batch_processing":
33
+ return False
34
+
35
+ return context != "optional_feature"
36
+
37
+ if isinstance(exception, KreuzbergError) and context == "optional_feature":
38
+ return False
39
+
40
+ if context == "batch_processing":
41
+ return isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError))
42
+
43
+ return not (context == "optional_feature" and isinstance(exception, (IOError, ImportError)))
44
+
45
+
46
+ class FeatureProcessingError:
47
+ """Type-safe processing error for extraction features."""
48
+
49
+ def __init__(self, feature: str, error: Exception) -> None:
50
+ self._feature = feature
51
+ self._error = error
52
+ self._traceback = traceback.format_exc()
53
+
54
+ @property
55
+ def feature(self) -> str:
56
+ return self._feature
57
+
58
+ @property
59
+ def error_type(self) -> str:
60
+ return type(self._error).__name__
61
+
62
+ @property
63
+ def error_message(self) -> str:
64
+ return str(self._error)
65
+
66
+ @property
67
+ def traceback(self) -> str:
68
+ return self._traceback
69
+
70
+ def to_dict(self) -> ProcessingErrorDict:
71
+ return {
72
+ "feature": self.feature,
73
+ "error_type": self.error_type,
74
+ "error_message": self.error_message,
75
+ "traceback": self.traceback,
76
+ }
77
+
78
+
79
+ def safe_feature_execution(
80
+ feature_name: str,
81
+ execution_func: Callable[[], Any],
82
+ default_value: Any,
83
+ result: ExtractionResult,
84
+ context: ErrorContextType = "optional_feature",
85
+ ) -> Any:
86
+ """Safely execute a feature extraction function with proper error handling.
87
+
88
+ Args:
89
+ feature_name: Name of the feature being executed
90
+ execution_func: Function to execute that may raise exceptions
91
+ default_value: Default value to return if execution fails
92
+ result: ExtractionResult to update with error information
93
+ context: The context for exception handling decisions
94
+
95
+ Returns:
96
+ Either the successful result or the default value
97
+ """
98
+ try:
99
+ return execution_func()
100
+ except Exception as e:
101
+ if should_exception_bubble_up(e, context):
102
+ raise
103
+
104
+ _add_processing_error(result, FeatureProcessingError(feature_name, e))
105
+ return default_value
106
+
107
+
108
+ def _add_processing_error(result: ExtractionResult, error: FeatureProcessingError) -> None:
109
+ """Add a processing error to the result metadata in a type-safe way."""
110
+ if result.metadata is None:
111
+ result.metadata = {}
112
+
113
+ if "processing_errors" not in result.metadata:
114
+ result.metadata["processing_errors"] = []
115
+
116
+ errors_list = result.metadata["processing_errors"]
117
+ if isinstance(errors_list, list):
118
+ errors_list.append(error.to_dict())
119
+ else:
120
+ result.metadata["processing_errors"] = [error.to_dict()]
121
+
122
+
123
+ def preserve_result_with_errors(
124
+ result: ExtractionResult,
125
+ errors: list[FeatureProcessingError],
126
+ ) -> ExtractionResult:
127
+ """Preserve a successful extraction result while adding error information.
128
+
129
+ This is used when core extraction succeeds but optional features fail.
130
+
131
+ Args:
132
+ result: The successful extraction result
133
+ errors: List of errors that occurred during optional processing
134
+
135
+ Returns:
136
+ The result with error information added to metadata
137
+ """
138
+ for error in errors:
139
+ _add_processing_error(result, error)
140
+
141
+ return result
142
+
143
+
144
+ def create_error_result(
145
+ content: str,
146
+ mime_type: str,
147
+ errors: list[FeatureProcessingError],
148
+ **metadata_kwargs: Any,
149
+ ) -> ExtractionResult:
150
+ """Create an error result with proper type safety.
151
+
152
+ Args:
153
+ content: Error content to include
154
+ mime_type: MIME type of the result
155
+ errors: List of errors that occurred
156
+ **metadata_kwargs: Additional metadata to include
157
+
158
+ Returns:
159
+ An ExtractionResult with error information
160
+ """
161
+ metadata: Metadata = {
162
+ "error": f"Multiple processing errors occurred: {len(errors)} errors",
163
+ "error_context": {
164
+ "error_count": len(errors),
165
+ "errors": [error.to_dict() for error in errors],
166
+ **metadata_kwargs,
167
+ },
168
+ "processing_errors": [error.to_dict() for error in errors],
169
+ }
170
+
171
+ return ExtractionResult(
172
+ content=content,
173
+ chunks=[],
174
+ mime_type=mime_type,
175
+ metadata=metadata,
176
+ entities=[],
177
+ keywords=[],
178
+ detected_languages=[],
179
+ tables=[],
180
+ images=[],
181
+ image_ocr_results=[],
182
+ )
@@ -230,13 +230,13 @@ class Extractor(ABC):
230
230
  confidence_score=None,
231
231
  processing_time=duration,
232
232
  )
233
- except (OSError, ValueError) as e: # pragma: no cover
233
+ except ValueError as e: # pragma: no cover
234
234
  return ImageOCRResult(
235
235
  image=target,
236
236
  ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
237
237
  skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
238
238
  )
239
- except (RuntimeError, TypeError) as e: # pragma: no cover
239
+ except TypeError as e: # pragma: no cover
240
240
  return ImageOCRResult(
241
241
  image=target,
242
242
  ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
@@ -0,0 +1,138 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import TYPE_CHECKING, Any, ClassVar
5
+
6
+ from anyio import Path as AsyncPath
7
+ from html_to_markdown import HtmlToMarkdownError
8
+ from html_to_markdown._html_to_markdown import (
9
+ InlineImageConfig,
10
+ convert_with_inline_images,
11
+ )
12
+ from html_to_markdown._html_to_markdown import (
13
+ convert as rust_convert,
14
+ )
15
+
16
+ from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
17
+ from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
18
+ from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
19
+ from kreuzberg._utils._string import safe_decode
20
+ from kreuzberg._utils._sync import run_maybe_async, run_sync
21
+
22
+ if TYPE_CHECKING:
23
+ from pathlib import Path
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ class HTMLExtractor(Extractor):
29
+ SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {HTML_MIME_TYPE}
30
+
31
+ async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
32
+ result = await run_sync(self.extract_bytes_sync, content)
33
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
34
+ result.image_ocr_results = await self._process_images_with_ocr(result.images)
35
+ return result
36
+
37
+ async def extract_path_async(self, path: Path) -> ExtractionResult:
38
+ content = await AsyncPath(path).read_bytes()
39
+ result = await run_sync(self.extract_bytes_sync, content)
40
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
41
+ result.image_ocr_results = await self._process_images_with_ocr(result.images)
42
+ return result
43
+
44
+ def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
45
+ extraction_config = self.config
46
+ html_content = safe_decode(content)
47
+ if extraction_config and extraction_config.html_to_markdown_config is not None:
48
+ html_config = extraction_config.html_to_markdown_config
49
+ else:
50
+ html_config = HTMLToMarkdownConfig()
51
+ conversion_options, _ = html_config.to_options()
52
+
53
+ extract_inline_images = bool(extraction_config and extraction_config.extract_images)
54
+ run_ocr_on_images = bool(
55
+ extraction_config and extraction_config.extract_images and extraction_config.ocr_extracted_images
56
+ )
57
+ inline_image_config = None
58
+ if extract_inline_images:
59
+ inline_image_config = InlineImageConfig(
60
+ max_decoded_size_bytes=MAX_SINGLE_IMAGE_SIZE,
61
+ filename_prefix=None,
62
+ capture_svg=True,
63
+ infer_dimensions=True,
64
+ )
65
+
66
+ try:
67
+ if extract_inline_images:
68
+ markdown, images_payload, warnings = convert_with_inline_images(
69
+ html_content,
70
+ options=conversion_options,
71
+ image_config=inline_image_config,
72
+ )
73
+ else:
74
+ markdown = rust_convert(
75
+ html_content,
76
+ conversion_options,
77
+ )
78
+ images_payload = []
79
+ warnings = []
80
+ except (HtmlToMarkdownError, ValueError) as exc:
81
+ logger.exception("Failed to convert HTML to Markdown: %s", exc)
82
+ markdown = ""
83
+ images_payload = []
84
+ warnings = []
85
+
86
+ for warning in warnings:
87
+ self._log_inline_warning(warning)
88
+
89
+ extraction_result = ExtractionResult(content=markdown, mime_type=MARKDOWN_MIME_TYPE, metadata={})
90
+
91
+ inline_images = [self._build_extracted_image(image) for image in images_payload]
92
+ if inline_images:
93
+ extraction_result.images = inline_images
94
+ if run_ocr_on_images:
95
+ extraction_result.image_ocr_results = run_maybe_async(
96
+ self._process_images_with_ocr,
97
+ inline_images,
98
+ )
99
+
100
+ return self._apply_quality_processing(extraction_result)
101
+
102
+ def extract_path_sync(self, path: Path) -> ExtractionResult:
103
+ content = path.read_bytes()
104
+ return self.extract_bytes_sync(content)
105
+
106
+ @staticmethod
107
+ def _build_extracted_image(image: dict[str, Any]) -> ExtractedImage:
108
+ dimensions_value = image.get("dimensions")
109
+ dimensions = tuple(dimensions_value) if dimensions_value else None
110
+ return ExtractedImage(
111
+ data=image["data"],
112
+ format=image["format"],
113
+ filename=image.get("filename"),
114
+ description=image.get("description"),
115
+ dimensions=dimensions,
116
+ )
117
+
118
+ @staticmethod
119
+ def _log_inline_warning(warning: Any) -> None:
120
+ if isinstance(warning, dict):
121
+ index = warning.get("index")
122
+ message = warning.get("message")
123
+ if index is not None and message:
124
+ logger.warning("Inline image %s: %s", index, message)
125
+ elif message:
126
+ logger.warning("Inline image warning: %s", message)
127
+ else:
128
+ logger.warning("Inline image warning received with no message")
129
+ return
130
+
131
+ message = getattr(warning, "message", None)
132
+ index = getattr(warning, "index", None)
133
+ if message and index is not None:
134
+ logger.warning("Inline image %s: %s", index, message)
135
+ elif message:
136
+ logger.warning("Inline image warning: %s", message)
137
+ else:
138
+ logger.warning("Inline image warning received with no message")
@@ -6,7 +6,6 @@ import logging
6
6
  import os
7
7
  import tempfile
8
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
- from dataclasses import asdict
10
9
  from itertools import count
11
10
  from multiprocessing import cpu_count
12
11
  from pathlib import Path
@@ -27,14 +26,11 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
27
26
  from kreuzberg._ocr import get_ocr_backend
28
27
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
29
28
  from kreuzberg._types import (
30
- EasyOCRConfig,
31
29
  ExtractedImage,
32
30
  ExtractionResult,
33
31
  ImageOCRResult,
34
32
  Metadata,
35
33
  OcrBackendType,
36
- PaddleOCRConfig,
37
- TesseractConfig,
38
34
  )
39
35
  from kreuzberg._utils._errors import create_error_context, should_retry
40
36
  from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
@@ -134,48 +130,47 @@ class PDFExtractor(Extractor):
134
130
  def extract_path_sync(self, path: Path) -> ExtractionResult:
135
131
  content_bytes = path.read_bytes()
136
132
 
133
+ result: ExtractionResult | None = None
134
+
137
135
  document: Document | None = None
138
136
  if self.config.extract_images or self.config.extract_tables:
139
137
  document = self._parse_with_password_attempts(content_bytes)
140
138
 
141
- try:
142
- text = self._extract_pdf_searchable_text_sync(path)
143
- except ParsingError:
144
- text = ""
139
+ if not self.config.force_ocr:
140
+ try:
141
+ content = self._extract_pdf_searchable_text_sync(path)
142
+ if self._validate_extracted_text(content):
143
+ result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
144
+ except ParsingError:
145
+ pass
145
146
 
146
- if (self.config.force_ocr or not self._validate_extracted_text(text)) and self.config.ocr_backend is not None:
147
- text = self._extract_pdf_with_ocr_sync(path)
147
+ if not result and self.config.ocr_backend is not None:
148
+ result = self._extract_pdf_text_with_ocr_sync(path, self.config.ocr_backend)
149
+
150
+ if not result:
151
+ result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
152
+
153
+ metadata = self._extract_metadata_with_password_attempts_sync(content_bytes)
154
+ result.metadata = metadata
148
155
 
149
- tables = []
150
156
  if self.config.extract_tables:
151
157
  # GMFT is optional dependency ~keep
152
158
  try:
153
159
  from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
154
160
 
155
161
  tables = extract_tables_sync(path)
162
+ result.tables = tables
156
163
  except ImportError: # pragma: no cover
157
- tables = []
158
-
159
- if not self.config.force_ocr and self._validate_extracted_text(text):
160
- text = self._extract_with_playa_sync(path, fallback_text=text)
161
-
162
- text = normalize_spaces(text)
163
-
164
- result = ExtractionResult(
165
- content=text,
166
- mime_type=PLAIN_TEXT_MIME_TYPE,
167
- metadata={},
168
- tables=list(tables),
169
- )
164
+ result.tables = []
170
165
 
171
- if tables:
172
- table_summary = generate_table_summary(tables)
173
- result.metadata = result.metadata | {
174
- "table_count": table_summary["table_count"],
175
- "tables_summary": f"Document contains {table_summary['table_count']} tables "
176
- f"across {table_summary['pages_with_tables']} pages with "
177
- f"{table_summary['total_rows']} total rows",
178
- }
166
+ if result.tables:
167
+ table_summary = generate_table_summary(result.tables)
168
+ result.metadata = result.metadata | {
169
+ "table_count": table_summary["table_count"],
170
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
171
+ f"across {table_summary['pages_with_tables']} pages with "
172
+ f"{table_summary['total_rows']} total rows",
173
+ }
179
174
 
180
175
  if self.config.extract_images and document:
181
176
  images = self._extract_images_from_playa_sync(document)
@@ -405,7 +400,7 @@ class PDFExtractor(Extractor):
405
400
  except Exception as e:
406
401
  raise ParsingError(f"Failed to extract PDF text: {e}") from e
407
402
 
408
- def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
403
+ def _extract_pdf_text_with_ocr_sync(self, path: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
409
404
  temp_files: list[Path] = []
410
405
  try:
411
406
  with pdf_document_sync(path) as pdf:
@@ -443,7 +438,8 @@ class PDFExtractor(Extractor):
443
438
  with pdf_resources_sync(bitmap, page):
444
439
  pil_image.close()
445
440
 
446
- return self._process_pdf_images_with_ocr([str(p) for p in temp_files])
441
+ content = self._process_pdf_images_with_ocr([str(p) for p in temp_files], ocr_backend)
442
+ return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
447
443
 
448
444
  except Exception as e:
449
445
  raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -452,28 +448,11 @@ class PDFExtractor(Extractor):
452
448
  with contextlib.suppress(OSError):
453
449
  p.unlink()
454
450
 
455
- def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
456
- backend = get_ocr_backend(self.config.ocr_backend)
451
+ def _process_pdf_images_with_ocr(self, image_paths: list[str], ocr_backend: OcrBackendType) -> str:
452
+ backend = get_ocr_backend(ocr_backend)
457
453
  paths = [Path(p) for p in image_paths]
458
454
 
459
- match self.config.ocr_backend:
460
- case "tesseract":
461
- config = (
462
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
463
- )
464
- results = backend.process_batch_sync(paths, **asdict(config))
465
- case "paddleocr":
466
- paddle_config = (
467
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
468
- )
469
- results = backend.process_batch_sync(paths, **asdict(paddle_config))
470
- case "easyocr":
471
- easy_config = (
472
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
473
- )
474
- results = backend.process_batch_sync(paths, **asdict(easy_config))
475
- case _:
476
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
455
+ results = backend.process_batch_sync(paths, **self.config.get_config_dict())
477
456
 
478
457
  return "\n\n".join(result.content for result in results)
479
458
 
@@ -14,7 +14,7 @@ else: # pragma: no cover
14
14
  try:
15
15
  import yaml
16
16
  except ImportError: # pragma: no cover
17
- yaml = None
17
+ yaml = None # type: ignore[assignment]
18
18
 
19
19
 
20
20
  from anyio import Path as AsyncPath