kreuzberg 3.17.3__tar.gz → 3.20.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (360) hide show
  1. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/PKG-INFO +33 -46
  2. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_api/main.py +45 -3
  3. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_entity_extraction.py +108 -18
  4. kreuzberg-3.20.1/kreuzberg/_error_handling.py +182 -0
  5. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_base.py +2 -2
  6. kreuzberg-3.20.1/kreuzberg/_extractors/_html.py +138 -0
  7. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_pdf.py +33 -54
  8. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_structured.py +1 -1
  9. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_language_detection.py +2 -0
  10. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/_tesseract.py +76 -297
  11. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_types.py +143 -47
  12. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/cli.py +36 -22
  13. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/extraction.py +251 -107
  14. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/pyproject.toml +59 -75
  15. kreuzberg-3.17.3/.commitlintrc +0 -1
  16. kreuzberg-3.17.3/.deepsource.toml +0 -54
  17. kreuzberg-3.17.3/.docker/Dockerfile +0 -79
  18. kreuzberg-3.17.3/.docker/README.md +0 -190
  19. kreuzberg-3.17.3/.dockerignore +0 -15
  20. kreuzberg-3.17.3/.github/dependabot.yaml +0 -6
  21. kreuzberg-3.17.3/.github/workflows/ci.yaml +0 -381
  22. kreuzberg-3.17.3/.github/workflows/docker-e2e-tests.yml +0 -150
  23. kreuzberg-3.17.3/.github/workflows/docs.yml +0 -66
  24. kreuzberg-3.17.3/.github/workflows/pr-title.yaml +0 -20
  25. kreuzberg-3.17.3/.github/workflows/publish-docker.yml +0 -163
  26. kreuzberg-3.17.3/.github/workflows/release.yaml +0 -37
  27. kreuzberg-3.17.3/.github/workflows/test-docker-builds.yml +0 -101
  28. kreuzberg-3.17.3/.gitignore +0 -74
  29. kreuzberg-3.17.3/.markdownlint.yaml +0 -17
  30. kreuzberg-3.17.3/.pre-commit-config.yaml +0 -82
  31. kreuzberg-3.17.3/.prettierignore +0 -1
  32. kreuzberg-3.17.3/ATTRIBUTIONS.md +0 -47
  33. kreuzberg-3.17.3/LICENSE +0 -7
  34. kreuzberg-3.17.3/Taskfile.yml +0 -50
  35. kreuzberg-3.17.3/ai-rulez.yaml +0 -586
  36. kreuzberg-3.17.3/benchmarks/README.md +0 -264
  37. kreuzberg-3.17.3/benchmarks/batch_size_benchmark.py +0 -179
  38. kreuzberg-3.17.3/benchmarks/batch_validation_benchmark.py +0 -83
  39. kreuzberg-3.17.3/benchmarks/pyproject.toml +0 -29
  40. kreuzberg-3.17.3/benchmarks/src/__init__.py +0 -1
  41. kreuzberg-3.17.3/benchmarks/src/__main__.py +0 -4
  42. kreuzberg-3.17.3/benchmarks/src/benchmarks.py +0 -703
  43. kreuzberg-3.17.3/benchmarks/src/cli.py +0 -723
  44. kreuzberg-3.17.3/benchmarks/src/models.py +0 -195
  45. kreuzberg-3.17.3/benchmarks/src/profiler.py +0 -161
  46. kreuzberg-3.17.3/benchmarks/src/runner.py +0 -367
  47. kreuzberg-3.17.3/benchmarks/token_reduction_compression_benchmark.py +0 -268
  48. kreuzberg-3.17.3/docs/advanced/custom-extractors.md +0 -203
  49. kreuzberg-3.17.3/docs/advanced/custom-hooks.md +0 -148
  50. kreuzberg-3.17.3/docs/advanced/error-handling.md +0 -181
  51. kreuzberg-3.17.3/docs/advanced/index.md +0 -41
  52. kreuzberg-3.17.3/docs/advanced/performance.md +0 -306
  53. kreuzberg-3.17.3/docs/api-reference/exceptions.md +0 -33
  54. kreuzberg-3.17.3/docs/api-reference/extraction-functions.md +0 -59
  55. kreuzberg-3.17.3/docs/api-reference/extractor-registry.md +0 -5
  56. kreuzberg-3.17.3/docs/api-reference/index.md +0 -51
  57. kreuzberg-3.17.3/docs/api-reference/ocr-configuration.md +0 -27
  58. kreuzberg-3.17.3/docs/api-reference/types.md +0 -120
  59. kreuzberg-3.17.3/docs/assets/favicon.png +0 -0
  60. kreuzberg-3.17.3/docs/assets/logo.png +0 -0
  61. kreuzberg-3.17.3/docs/cli.md +0 -225
  62. kreuzberg-3.17.3/docs/contributing.md +0 -82
  63. kreuzberg-3.17.3/docs/css/extra.css +0 -56
  64. kreuzberg-3.17.3/docs/examples/extraction-examples.md +0 -763
  65. kreuzberg-3.17.3/docs/examples/index.md +0 -48
  66. kreuzberg-3.17.3/docs/getting-started/index.md +0 -20
  67. kreuzberg-3.17.3/docs/getting-started/installation.md +0 -154
  68. kreuzberg-3.17.3/docs/getting-started/quick-start.md +0 -111
  69. kreuzberg-3.17.3/docs/index.md +0 -60
  70. kreuzberg-3.17.3/docs/user-guide/api-server.md +0 -500
  71. kreuzberg-3.17.3/docs/user-guide/basic-usage.md +0 -161
  72. kreuzberg-3.17.3/docs/user-guide/chunking.md +0 -124
  73. kreuzberg-3.17.3/docs/user-guide/docker.md +0 -548
  74. kreuzberg-3.17.3/docs/user-guide/document-classification.md +0 -61
  75. kreuzberg-3.17.3/docs/user-guide/extraction-configuration.md +0 -966
  76. kreuzberg-3.17.3/docs/user-guide/index.md +0 -45
  77. kreuzberg-3.17.3/docs/user-guide/mcp-server.md +0 -586
  78. kreuzberg-3.17.3/docs/user-guide/metadata-extraction.md +0 -125
  79. kreuzberg-3.17.3/docs/user-guide/ocr-backends.md +0 -247
  80. kreuzberg-3.17.3/docs/user-guide/ocr-configuration.md +0 -414
  81. kreuzberg-3.17.3/docs/user-guide/supported-formats.md +0 -71
  82. kreuzberg-3.17.3/docs/user-guide/token-reduction.md +0 -251
  83. kreuzberg-3.17.3/kreuzberg/_extractors/_html.py +0 -148
  84. kreuzberg-3.17.3/kreuzberg/_utils/__init__.py +0 -0
  85. kreuzberg-3.17.3/kreuzberg/_utils/_html_streaming.py +0 -20
  86. kreuzberg-3.17.3/kreuzberg/py.typed +0 -0
  87. kreuzberg-3.17.3/mkdocs.yaml +0 -160
  88. kreuzberg-3.17.3/tests/__init__.py +0 -0
  89. kreuzberg-3.17.3/tests/api/__init__.py +0 -0
  90. kreuzberg-3.17.3/tests/api/config_cache_test.py +0 -224
  91. kreuzberg-3.17.3/tests/api/conftest.py +0 -18
  92. kreuzberg-3.17.3/tests/api/header_config_hashing_test.py +0 -29
  93. kreuzberg-3.17.3/tests/api/image_extraction_test.py +0 -59
  94. kreuzberg-3.17.3/tests/api/main_test.py +0 -817
  95. kreuzberg-3.17.3/tests/api/runtime_config_test.py +0 -374
  96. kreuzberg-3.17.3/tests/conftest.py +0 -219
  97. kreuzberg-3.17.3/tests/core/__init__.py +0 -0
  98. kreuzberg-3.17.3/tests/core/comprehensive_config_test.py +0 -664
  99. kreuzberg-3.17.3/tests/core/config_test.py +0 -15
  100. kreuzberg-3.17.3/tests/core/constants_test.py +0 -22
  101. kreuzberg-3.17.3/tests/core/dpi_configuration_test.py +0 -319
  102. kreuzberg-3.17.3/tests/core/exceptions_test.py +0 -159
  103. kreuzberg-3.17.3/tests/core/extraction_batch_test.py +0 -389
  104. kreuzberg-3.17.3/tests/core/extraction_test.py +0 -494
  105. kreuzberg-3.17.3/tests/core/html_to_markdown_config_test.py +0 -0
  106. kreuzberg-3.17.3/tests/core/image_ocr_result_test.py +0 -27
  107. kreuzberg-3.17.3/tests/core/init_test.py +0 -85
  108. kreuzberg-3.17.3/tests/core/main_test.py +0 -35
  109. kreuzberg-3.17.3/tests/core/mime_types_test.py +0 -242
  110. kreuzberg-3.17.3/tests/core/registry_test.py +0 -225
  111. kreuzberg-3.17.3/tests/core/types_test.py +0 -465
  112. kreuzberg-3.17.3/tests/e2e/__init__.py +0 -0
  113. kreuzberg-3.17.3/tests/e2e/docker_e2e.py +0 -481
  114. kreuzberg-3.17.3/tests/extractors/README_image_tests.md +0 -85
  115. kreuzberg-3.17.3/tests/extractors/__init__.py +0 -0
  116. kreuzberg-3.17.3/tests/extractors/base_extractor_test.py +0 -420
  117. kreuzberg-3.17.3/tests/extractors/base_memory_limits_test.py +0 -100
  118. kreuzberg-3.17.3/tests/extractors/base_ocr_processing_test.py +0 -276
  119. kreuzberg-3.17.3/tests/extractors/base_ocr_simple_test.py +0 -64
  120. kreuzberg-3.17.3/tests/extractors/email_error_paths_test.py +0 -39
  121. kreuzberg-3.17.3/tests/extractors/email_test.py +0 -948
  122. kreuzberg-3.17.3/tests/extractors/html_invalid_base64_test.py +0 -11
  123. kreuzberg-3.17.3/tests/extractors/html_test.py +0 -52
  124. kreuzberg-3.17.3/tests/extractors/image_deduplication_test.py +0 -87
  125. kreuzberg-3.17.3/tests/extractors/image_error_handling_test.py +0 -253
  126. kreuzberg-3.17.3/tests/extractors/image_error_simple_test.py +0 -75
  127. kreuzberg-3.17.3/tests/extractors/image_test.py +0 -766
  128. kreuzberg-3.17.3/tests/extractors/json_test.py +0 -427
  129. kreuzberg-3.17.3/tests/extractors/pandoc_metadata_test.py +0 -323
  130. kreuzberg-3.17.3/tests/extractors/pandoc_test.py +0 -1995
  131. kreuzberg-3.17.3/tests/extractors/pdf_images_test.py +0 -52
  132. kreuzberg-3.17.3/tests/extractors/pdf_sync_images_test.py +0 -217
  133. kreuzberg-3.17.3/tests/extractors/pdf_test.py +0 -905
  134. kreuzberg-3.17.3/tests/extractors/presentation_test.py +0 -967
  135. kreuzberg-3.17.3/tests/extractors/spreadsheet_test.py +0 -1140
  136. kreuzberg-3.17.3/tests/extractors/structured_test.py +0 -304
  137. kreuzberg-3.17.3/tests/features/__init__.py +0 -0
  138. kreuzberg-3.17.3/tests/features/chunker_test.py +0 -94
  139. kreuzberg-3.17.3/tests/features/document_classification_test.py +0 -747
  140. kreuzberg-3.17.3/tests/features/entity_extraction_test.py +0 -404
  141. kreuzberg-3.17.3/tests/features/gmft_test.py +0 -1496
  142. kreuzberg-3.17.3/tests/features/hooks_test.py +0 -0
  143. kreuzberg-3.17.3/tests/features/language_detection_test.py +0 -343
  144. kreuzberg-3.17.3/tests/features/table_extraction_test.py +0 -0
  145. kreuzberg-3.17.3/tests/features/token_reduction_test.py +0 -813
  146. kreuzberg-3.17.3/tests/integration/__init__.py +0 -0
  147. kreuzberg-3.17.3/tests/integration/all_extractors_images_test.py +0 -252
  148. kreuzberg-3.17.3/tests/integration/api/__init__.py +0 -0
  149. kreuzberg-3.17.3/tests/integration/api/large_file_test.py +0 -0
  150. kreuzberg-3.17.3/tests/integration/api/mounted_config_test.py +0 -0
  151. kreuzberg-3.17.3/tests/integration/dpi_integration_test.py +0 -209
  152. kreuzberg-3.17.3/tests/integration/multiprocessing/__init__.py +0 -0
  153. kreuzberg-3.17.3/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
  154. kreuzberg-3.17.3/tests/integration/ocr/__init__.py +0 -0
  155. kreuzberg-3.17.3/tests/integration/ocr/device_integration_test.py +0 -0
  156. kreuzberg-3.17.3/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
  157. kreuzberg-3.17.3/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
  158. kreuzberg-3.17.3/tests/integration/pandoc_images_test.py +0 -30
  159. kreuzberg-3.17.3/tests/integration/pdf_images_test.py +0 -18
  160. kreuzberg-3.17.3/tests/integration/pdf_real_images_test.py +0 -52
  161. kreuzberg-3.17.3/tests/integration/pptx_complex_test.py +0 -22
  162. kreuzberg-3.17.3/tests/integration/pptx_images_test.py +0 -18
  163. kreuzberg-3.17.3/tests/integration/regression_test.py +0 -134
  164. kreuzberg-3.17.3/tests/integration/token_reduction_integration_test.py +0 -173
  165. kreuzberg-3.17.3/tests/interfaces/__init__.py +0 -0
  166. kreuzberg-3.17.3/tests/interfaces/cli_test.py +0 -527
  167. kreuzberg-3.17.3/tests/interfaces/mcp_server_test.py +0 -1116
  168. kreuzberg-3.17.3/tests/mcp/__init__.py +0 -0
  169. kreuzberg-3.17.3/tests/mcp/mcp_server_test.py +0 -0
  170. kreuzberg-3.17.3/tests/multiprocessing/__init__.py +0 -0
  171. kreuzberg-3.17.3/tests/multiprocessing/gmft_isolated_test.py +0 -449
  172. kreuzberg-3.17.3/tests/multiprocessing/process_manager_test.py +0 -273
  173. kreuzberg-3.17.3/tests/multiprocessing/tesseract_pool_test.py +0 -331
  174. kreuzberg-3.17.3/tests/ocr/__init__.py +0 -0
  175. kreuzberg-3.17.3/tests/ocr/base_test.py +0 -80
  176. kreuzberg-3.17.3/tests/ocr/easyocr_test.py +0 -517
  177. kreuzberg-3.17.3/tests/ocr/init_test.py +0 -35
  178. kreuzberg-3.17.3/tests/ocr/paddleocr_test.py +0 -835
  179. kreuzberg-3.17.3/tests/ocr/tesseract_test.py +0 -1314
  180. kreuzberg-3.17.3/tests/ocr/tesseract_tsv_test.py +0 -409
  181. kreuzberg-3.17.3/tests/performance/__init__.py +0 -0
  182. kreuzberg-3.17.3/tests/performance/large_pdf_perf_test.py +0 -29
  183. kreuzberg-3.17.3/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
  184. kreuzberg-3.17.3/tests/test_source_files/contract.txt +0 -1
  185. kreuzberg-3.17.3/tests/test_source_files/contract_test.txt +0 -4
  186. kreuzberg-3.17.3/tests/test_source_files/document.docx +0 -0
  187. kreuzberg-3.17.3/tests/test_source_files/email/sample-email.eml +0 -11
  188. kreuzberg-3.17.3/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  189. kreuzberg-3.17.3/tests/test_source_files/excel.xlsx +0 -0
  190. kreuzberg-3.17.3/tests/test_source_files/flower-no-text.jpg +0 -0
  191. kreuzberg-3.17.3/tests/test_source_files/form_test.txt +0 -5
  192. kreuzberg-3.17.3/tests/test_source_files/french-text.txt +0 -2
  193. kreuzberg-3.17.3/tests/test_source_files/german-text.txt +0 -2
  194. kreuzberg-3.17.3/tests/test_source_files/google-doc-document.pdf +0 -0
  195. kreuzberg-3.17.3/tests/test_source_files/html.html +0 -10
  196. kreuzberg-3.17.3/tests/test_source_files/images/test_hello_world.png +0 -0
  197. kreuzberg-3.17.3/tests/test_source_files/invoice_image.png +0 -0
  198. kreuzberg-3.17.3/tests/test_source_files/invoice_test.txt +0 -4
  199. kreuzberg-3.17.3/tests/test_source_files/json/complex_nested.json +0 -41
  200. kreuzberg-3.17.3/tests/test_source_files/json/real_world/aws_policy.json +0 -43
  201. kreuzberg-3.17.3/tests/test_source_files/json/real_world/earthquakes.geojson +0 -6
  202. kreuzberg-3.17.3/tests/test_source_files/json/real_world/github_emojis.json +0 -111
  203. kreuzberg-3.17.3/tests/test_source_files/json/real_world/iss_location.json +0 -1
  204. kreuzberg-3.17.3/tests/test_source_files/json/real_world/openapi_spec.json +0 -84
  205. kreuzberg-3.17.3/tests/test_source_files/json/real_world/package.json +0 -33
  206. kreuzberg-3.17.3/tests/test_source_files/json/real_world/rick_morty_character.json +0 -1
  207. kreuzberg-3.17.3/tests/test_source_files/json/sample-document.json +0 -1
  208. kreuzberg-3.17.3/tests/test_source_files/json/schema_test.json +0 -25
  209. kreuzberg-3.17.3/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  210. kreuzberg-3.17.3/tests/test_source_files/markdown.md +0 -1
  211. kreuzberg-3.17.3/tests/test_source_files/non-ascii-text.pdf +0 -0
  212. kreuzberg-3.17.3/tests/test_source_files/non-searchable.pdf +0 -0
  213. kreuzberg-3.17.3/tests/test_source_files/ocr-image.jpg +0 -0
  214. kreuzberg-3.17.3/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  215. kreuzberg-3.17.3/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  216. kreuzberg-3.17.3/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  217. kreuzberg-3.17.3/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  218. kreuzberg-3.17.3/tests/test_source_files/receipt_test.txt +0 -5
  219. kreuzberg-3.17.3/tests/test_source_files/report_test.txt +0 -4
  220. kreuzberg-3.17.3/tests/test_source_files/sample-contract.pdf +0 -0
  221. kreuzberg-3.17.3/tests/test_source_files/scanned.pdf +0 -0
  222. kreuzberg-3.17.3/tests/test_source_files/searchable.pdf +0 -0
  223. kreuzberg-3.17.3/tests/test_source_files/sharable-web-guide.pdf +0 -0
  224. kreuzberg-3.17.3/tests/test_source_files/spanish-text.txt +0 -2
  225. kreuzberg-3.17.3/tests/test_source_files/tables/borderless_table.png +0 -0
  226. kreuzberg-3.17.3/tests/test_source_files/tables/complex_document.png +0 -0
  227. kreuzberg-3.17.3/tests/test_source_files/tables/simple_table.png +0 -0
  228. kreuzberg-3.17.3/tests/test_source_files/test-article.pdf +0 -0
  229. kreuzberg-3.17.3/tests/test_source_files/test-excel.xls +0 -0
  230. kreuzberg-3.17.3/tests/test_source_files/yaml/sample-config.yaml +0 -15
  231. kreuzberg-3.17.3/tests/utils/__init__.py +0 -0
  232. kreuzberg-3.17.3/tests/utils/cache_test.py +0 -427
  233. kreuzberg-3.17.3/tests/utils/device_test.py +0 -347
  234. kreuzberg-3.17.3/tests/utils/errors_test.py +0 -343
  235. kreuzberg-3.17.3/tests/utils/ocr_cache_test.py +0 -286
  236. kreuzberg-3.17.3/tests/utils/pdf_lock_test.py +0 -215
  237. kreuzberg-3.17.3/tests/utils/playa_helpers_test.py +0 -0
  238. kreuzberg-3.17.3/tests/utils/playa_metadata_test.py +0 -753
  239. kreuzberg-3.17.3/tests/utils/playa_test.py +0 -315
  240. kreuzberg-3.17.3/tests/utils/process_pool_test.py +0 -223
  241. kreuzberg-3.17.3/tests/utils/quality_test.py +0 -121
  242. kreuzberg-3.17.3/tests/utils/ref_test.py +0 -90
  243. kreuzberg-3.17.3/tests/utils/serialization_test.py +0 -379
  244. kreuzberg-3.17.3/tests/utils/string_test.py +0 -251
  245. kreuzberg-3.17.3/tests/utils/sync_test.py +0 -259
  246. kreuzberg-3.17.3/tests/utils/table_test.py +0 -353
  247. kreuzberg-3.17.3/tests/utils/tmp_test.py +0 -50
  248. kreuzberg-3.17.3/uv.lock +0 -6184
  249. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/README.md +0 -0
  250. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/__init__.py +0 -0
  251. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/__main__.py +0 -0
  252. {kreuzberg-3.17.3/benchmarks → kreuzberg-3.20.1/kreuzberg/_api}/__init__.py +0 -0
  253. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_api/_config_cache.py +0 -0
  254. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_chunker.py +0 -0
  255. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_config.py +0 -0
  256. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_constants.py +0 -0
  257. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_document_classification.py +0 -0
  258. {kreuzberg-3.17.3/kreuzberg/_api → kreuzberg-3.20.1/kreuzberg/_extractors}/__init__.py +0 -0
  259. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_email.py +0 -0
  260. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_image.py +0 -0
  261. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_pandoc.py +0 -0
  262. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_presentation.py +0 -0
  263. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  264. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_gmft.py +0 -0
  265. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_mcp/__init__.py +0 -0
  266. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_mcp/server.py +0 -0
  267. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_mime_types.py +0 -0
  268. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/__init__.py +0 -0
  269. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/_base.py +0 -0
  270. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/_easyocr.py +0 -0
  271. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/_paddleocr.py +0 -0
  272. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/_table_extractor.py +0 -0
  273. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_playa.py +0 -0
  274. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_registry.py +0 -0
  275. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/__init__.py +0 -0
  276. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/_reducer.py +0 -0
  277. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/_stopwords.py +0 -0
  278. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
  279. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
  280. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
  281. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
  282. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
  283. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
  284. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
  285. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
  286. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
  287. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
  288. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
  289. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
  290. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
  291. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
  292. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
  293. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
  294. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
  295. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
  296. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
  297. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
  298. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
  299. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
  300. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
  301. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
  302. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
  303. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
  304. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
  305. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
  306. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
  307. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
  308. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
  309. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
  310. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
  311. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
  312. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
  313. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
  314. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
  315. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
  316. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
  317. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
  318. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
  319. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
  320. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
  321. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
  322. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
  323. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
  324. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
  325. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
  326. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
  327. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
  328. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
  329. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
  330. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
  331. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
  332. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
  333. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
  334. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
  335. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
  336. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
  337. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
  338. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
  339. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
  340. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
  341. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
  342. {kreuzberg-3.17.3/kreuzberg/_extractors → kreuzberg-3.20.1/kreuzberg/_utils}/__init__.py +0 -0
  343. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_cache.py +0 -0
  344. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_device.py +0 -0
  345. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_document_cache.py +0 -0
  346. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_errors.py +0 -0
  347. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_image_preprocessing.py +0 -0
  348. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_ocr_cache.py +0 -0
  349. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
  350. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_process_pool.py +0 -0
  351. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_quality.py +0 -0
  352. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_ref.py +0 -0
  353. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_resource_managers.py +0 -0
  354. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_serialization.py +0 -0
  355. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_string.py +0 -0
  356. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_sync.py +0 -0
  357. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_table.py +0 -0
  358. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_tmp.py +0 -0
  359. {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/exceptions.py +0 -0
  360. {kreuzberg-3.17.3/benchmarks → kreuzberg-3.20.1/kreuzberg}/py.typed +0 -0
@@ -1,13 +1,11 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.3
2
2
  Name: kreuzberg
3
- Version: 3.17.3
3
+ Version: 3.20.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
- Project-URL: documentation, https://kreuzberg.dev
6
- Project-URL: homepage, https://github.com/Goldziher/kreuzberg
5
+ Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
6
+ Author: Na'aman Hirschfeld
7
7
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
8
8
  License: MIT
9
- License-File: LICENSE
10
- Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
11
9
  Classifier: Development Status :: 5 - Production/Stable
12
10
  Classifier: Intended Audience :: Developers
13
11
  Classifier: Intended Audience :: Information Technology
@@ -27,67 +25,56 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
27
25
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
26
  Classifier: Topic :: Text Processing :: General
29
27
  Classifier: Typing :: Typed
30
- Requires-Python: >=3.10
31
- Requires-Dist: anyio>=4.10.0
28
+ Requires-Dist: anyio>=4.11.0
32
29
  Requires-Dist: chardetng-py>=0.3.5
33
- Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.14.0
30
+ Requires-Dist: exceptiongroup>=1.2.2 ; python_full_version < '3.11'
31
+ Requires-Dist: html-to-markdown>=2.1.0
35
32
  Requires-Dist: langcodes>=3.5.0
36
- Requires-Dist: mcp>=1.14.1
33
+ Requires-Dist: mcp>=1.17.0
37
34
  Requires-Dist: msgspec>=0.18.0
38
35
  Requires-Dist: numpy>=2.0.0
39
36
  Requires-Dist: playa-pdf>=0.7.0
40
- Requires-Dist: polars>=1.33.1
37
+ Requires-Dist: polars>=1.34.0
41
38
  Requires-Dist: psutil>=7.1.0
42
39
  Requires-Dist: pypdfium2==4.30.0
43
40
  Requires-Dist: python-calamine>=0.5.3
44
41
  Requires-Dist: python-pptx>=1.0.2
45
- Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
42
+ Requires-Dist: transformers>=4.55.0
43
+ Requires-Dist: typing-extensions>=4.15.0 ; python_full_version < '3.12'
44
+ Requires-Dist: mailparse>=1.0.15 ; extra == 'additional-extensions'
45
+ Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'additional-extensions'
46
+ Requires-Dist: kreuzberg[additional-extensions,api,chunking,cli,crypto,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr] ; extra == 'all'
47
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0 ; extra == 'api'
48
+ Requires-Dist: semantic-text-splitter>=0.28.0 ; extra == 'chunking'
49
+ Requires-Dist: click>=8.3.0 ; extra == 'cli'
50
+ Requires-Dist: rich>=14.2.0 ; extra == 'cli'
51
+ Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'cli'
52
+ Requires-Dist: playa-pdf[crypto]>=0.7.0 ; extra == 'crypto'
53
+ Requires-Dist: deep-translator>=1.11.4 ; extra == 'document-classification'
54
+ Requires-Dist: easyocr>=1.7.2 ; python_full_version < '3.14' and extra == 'easyocr'
55
+ Requires-Dist: keybert>=0.9.0 ; extra == 'entity-extraction'
56
+ Requires-Dist: spacy>=3.8.7 ; python_full_version < '3.14' and extra == 'entity-extraction'
57
+ Requires-Dist: gmft>=0.4.2 ; extra == 'gmft'
58
+ Requires-Dist: transformers>=4.57.0 ; extra == 'gmft'
59
+ Requires-Dist: fast-langdetect>=1.0.0 ; extra == 'langdetect'
60
+ Requires-Dist: paddleocr>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
61
+ Requires-Dist: paddlepaddle>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
62
+ Requires-Dist: setuptools>=80.9.0 ; extra == 'paddleocr'
63
+ Requires-Python: >=3.10
64
+ Project-URL: documentation, https://kreuzberg.dev
65
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
46
66
  Provides-Extra: additional-extensions
47
- Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
48
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
49
67
  Provides-Extra: all
50
- Requires-Dist: click>=8.2.1; extra == 'all'
51
- Requires-Dist: deep-translator>=1.11.4; extra == 'all'
52
- Requires-Dist: easyocr>=1.7.2; extra == 'all'
53
- Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
54
- Requires-Dist: gmft>=0.4.2; extra == 'all'
55
- Requires-Dist: keybert>=0.9.0; extra == 'all'
56
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
57
- Requires-Dist: mailparse>=1.0.15; extra == 'all'
58
- Requires-Dist: paddleocr>=3.2.0; extra == 'all'
59
- Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
60
- Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
61
- Requires-Dist: rich>=14.1.0; extra == 'all'
62
- Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
63
- Requires-Dist: setuptools>=80.9.0; extra == 'all'
64
- Requires-Dist: spacy>=3.8.7; extra == 'all'
65
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
66
68
  Provides-Extra: api
67
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
68
69
  Provides-Extra: chunking
69
- Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
70
70
  Provides-Extra: cli
71
- Requires-Dist: click>=8.2.1; extra == 'cli'
72
- Requires-Dist: rich>=14.1.0; extra == 'cli'
73
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
74
71
  Provides-Extra: crypto
75
- Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
76
72
  Provides-Extra: document-classification
77
- Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
78
73
  Provides-Extra: easyocr
79
- Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
80
74
  Provides-Extra: entity-extraction
81
- Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
82
- Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
83
75
  Provides-Extra: gmft
84
- Requires-Dist: gmft>=0.4.2; extra == 'gmft'
85
76
  Provides-Extra: langdetect
86
- Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
87
77
  Provides-Extra: paddleocr
88
- Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
89
- Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
90
- Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
91
78
  Description-Content-Type: text/markdown
92
79
 
93
80
  # Kreuzberg
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import base64
4
4
  import io
5
+ import os
5
6
  import traceback
6
7
  from json import dumps
7
8
  from typing import TYPE_CHECKING, Annotated, Any, Literal
@@ -100,6 +101,35 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
100
101
  )
101
102
 
102
103
 
104
+ def _get_max_upload_size() -> int:
105
+ """Get the maximum upload size from environment variable.
106
+
107
+ Returns:
108
+ Maximum upload size in bytes. Defaults to 1GB if not set.
109
+
110
+ Environment Variables:
111
+ KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
112
+ """
113
+ default_size = 1024 * 1024 * 1024
114
+ try:
115
+ size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
116
+ return size if size >= 0 else default_size
117
+ except ValueError:
118
+ return default_size
119
+
120
+
121
+ def _is_opentelemetry_enabled() -> bool:
122
+ """Check if OpenTelemetry should be enabled.
123
+
124
+ Returns:
125
+ True if OpenTelemetry should be enabled, False otherwise.
126
+
127
+ Environment Variables:
128
+ KREUZBERG_ENABLE_OPENTELEMETRY: Enable OpenTelemetry tracing (true/false) (default: true)
129
+ """
130
+ return os.environ.get("KREUZBERG_ENABLE_OPENTELEMETRY", "true").lower() in ("true", "1", "yes", "on")
131
+
132
+
103
133
  def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
104
134
  error_type = type(exception).__name__
105
135
  error_message = str(exception)
@@ -242,7 +272,7 @@ async def handle_files_upload( # noqa: PLR0913
242
272
  - Language detection (if enabled)
243
273
 
244
274
  Supports various file formats including PDF, Office documents, images, and more.
245
- Maximum file size: 1GB per file.
275
+ Maximum file size: Configurable via KREUZBERG_MAX_UPLOAD_SIZE environment variable (default: 1GB per file).
246
276
 
247
277
  Args:
248
278
  request: The HTTP request object
@@ -280,6 +310,9 @@ async def handle_files_upload( # noqa: PLR0913
280
310
  """
281
311
  static_config = discover_config_cached()
282
312
 
313
+ if not data:
314
+ raise ValidationError("No files provided for extraction", context={"file_count": 0})
315
+
283
316
  min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
284
317
  max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
285
318
 
@@ -379,9 +412,18 @@ type_encoders = {
379
412
  Image.Image: _pil_image_encoder,
380
413
  }
381
414
 
415
+
416
+ def _get_plugins() -> list[Any]:
417
+ """Get configured plugins based on environment variables."""
418
+ plugins = []
419
+ if _is_opentelemetry_enabled():
420
+ plugins.append(OpenTelemetryPlugin(OpenTelemetryConfig()))
421
+ return plugins
422
+
423
+
382
424
  app = Litestar(
383
425
  route_handlers=[handle_files_upload, health_check, get_configuration],
384
- plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
426
+ plugins=_get_plugins(),
385
427
  logging_config=StructLoggingConfig(),
386
428
  openapi_config=openapi_config,
387
429
  exception_handlers={
@@ -389,5 +431,5 @@ app = Litestar(
389
431
  Exception: general_exception_handler,
390
432
  },
391
433
  type_encoders=type_encoders,
392
- request_max_body_size=1024 * 1024 * 1024,
434
+ request_max_body_size=_get_max_upload_size(),
393
435
  )
@@ -2,19 +2,77 @@ from __future__ import annotations
2
2
 
3
3
  import os
4
4
  import re
5
+ import shutil
5
6
  import subprocess
6
- import sys
7
7
  from functools import lru_cache
8
8
  from itertools import chain
9
9
  from typing import TYPE_CHECKING, Any
10
10
 
11
+ import anyio
12
+
11
13
  from kreuzberg._types import Entity, SpacyEntityExtractionConfig
14
+ from kreuzberg._utils._sync import run_sync
12
15
  from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
13
16
 
14
17
  if TYPE_CHECKING:
15
18
  from collections.abc import Sequence
16
19
 
17
20
 
21
+ def is_uv_available() -> bool:
22
+ """Check if uv is available in the environment."""
23
+ return shutil.which("uv") is not None
24
+
25
+
26
+ def get_spacy_model_url(model_name: str, version: str = "3.8.0") -> str:
27
+ """Get the direct download URL for a spaCy model.
28
+
29
+ Args:
30
+ model_name: Name of the spaCy model (e.g., 'en_core_web_sm')
31
+ version: Model version to download (default: 3.8.0)
32
+
33
+ Returns:
34
+ Direct download URL for the model
35
+ """
36
+ return f"https://github.com/explosion/spacy-models/releases/download/{model_name}-{version}/{model_name}-{version}-py3-none-any.whl"
37
+
38
+
39
+ async def install_spacy_model_with_uv(model_name: str) -> subprocess.CompletedProcess[str]:
40
+ """Install spaCy model using uv.
41
+
42
+ Args:
43
+ model_name: Name of the spaCy model to install
44
+
45
+ Returns:
46
+ Completed process result
47
+ """
48
+ model_url = get_spacy_model_url(model_name)
49
+ return await run_sync(
50
+ subprocess.run,
51
+ ["uv", "pip", "install", model_url],
52
+ capture_output=True,
53
+ text=True,
54
+ check=False,
55
+ )
56
+
57
+
58
+ async def install_spacy_model_with_spacy(model_name: str) -> bool:
59
+ """Install spaCy model using spacy download function.
60
+
61
+ Args:
62
+ model_name: Name of the spaCy model to install
63
+
64
+ Returns:
65
+ True if successful, False otherwise
66
+ """
67
+ try:
68
+ import spacy.cli.download # noqa: PLC0415
69
+
70
+ await run_sync(spacy.cli.download, model_name) # type: ignore[attr-defined]
71
+ return True
72
+ except (ImportError, OSError, RuntimeError):
73
+ return False
74
+
75
+
18
76
  def extract_entities(
19
77
  text: str,
20
78
  entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
@@ -46,11 +104,11 @@ def extract_entities(
46
104
  functionality="Entity Extraction",
47
105
  ) from e
48
106
 
49
- model_name = _select_spacy_model(languages, spacy_config)
107
+ model_name = select_spacy_model(languages, spacy_config)
50
108
  if not model_name:
51
109
  return entities
52
110
 
53
- nlp = _load_spacy_model(model_name, spacy_config)
111
+ nlp = load_spacy_model(model_name, spacy_config)
54
112
 
55
113
  if len(text) > spacy_config.max_doc_length:
56
114
  text = text[: spacy_config.max_doc_length]
@@ -74,7 +132,7 @@ def extract_entities(
74
132
 
75
133
 
76
134
  @lru_cache(maxsize=32)
77
- def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
135
+ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
78
136
  try:
79
137
  import spacy # noqa: PLC0415
80
138
  except ImportError:
@@ -86,22 +144,54 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
86
144
  try:
87
145
  nlp = spacy.load(model_name)
88
146
  except OSError:
89
- result = subprocess.run(
90
- [sys.executable, "-m", "spacy", "download", model_name],
91
- capture_output=True,
92
- text=True,
93
- check=False,
94
- )
95
147
 
96
- if result.returncode != 0:
148
+ async def install_model() -> tuple[bool, str | None]:
149
+ """Install model and return success status and error message."""
150
+ try:
151
+ success = await install_spacy_model_with_spacy(model_name)
152
+ if success:
153
+ return True, None
154
+ except (ImportError, OSError, RuntimeError) as e:
155
+ spacy_error = str(e)
156
+ else:
157
+ spacy_error = "spaCy download failed"
158
+
159
+ if is_uv_available():
160
+ try:
161
+ result = await install_spacy_model_with_uv(model_name)
162
+ return result.returncode == 0, result.stderr
163
+ except (OSError, subprocess.SubprocessError) as e:
164
+ return False, f"spaCy: {spacy_error}, uv: {e!s}"
165
+
166
+ return False, spacy_error
167
+
168
+ try:
169
+ success, error_details = anyio.run(install_model)
170
+ except SystemExit as e:
171
+ success, error_details = False, f"spaCy CLI exit code: {e.code}"
172
+
173
+ if not success:
174
+ if is_uv_available():
175
+ model_url = get_spacy_model_url(model_name)
176
+ manual_install_cmd = f"uv pip install {model_url}"
177
+ else:
178
+ manual_install_cmd = f"python -m spacy download {model_name}"
179
+
97
180
  error_msg = (
98
- f"Failed to download spaCy model '{model_name}'. "
99
- f"Please install it manually with: python -m spacy download {model_name}"
181
+ f"Failed to download spaCy model '{model_name}'. Please install it manually with: {manual_install_cmd}"
100
182
  )
101
- if result.stderr:
102
- error_msg += f"\nError details: {result.stderr}"
183
+
184
+ if error_details:
185
+ error_msg += f"\nError details: {error_details}"
186
+
103
187
  raise KreuzbergError(
104
- error_msg, context={"model": model_name, "stderr": result.stderr, "return_code": result.returncode}
188
+ error_msg,
189
+ context={
190
+ "model": model_name,
191
+ "manual_install_cmd": manual_install_cmd,
192
+ "error_details": error_details,
193
+ "uv_available": is_uv_available(),
194
+ },
105
195
  ) from None
106
196
 
107
197
  try:
@@ -118,7 +208,7 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
118
208
  return nlp
119
209
 
120
210
 
121
- def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
211
+ def select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
122
212
  if not languages:
123
213
  return spacy_config.get_model_for_language("en")
124
214
 
@@ -140,7 +230,7 @@ def extract_keywords(
140
230
  kw_model = KeyBERT()
141
231
  keywords = kw_model.extract_keywords(text, top_n=keyword_count)
142
232
  return [(kw, float(score)) for kw, score in keywords]
143
- except (RuntimeError, OSError, ValueError):
233
+ except ValueError:
144
234
  return []
145
235
  except ImportError as e: # pragma: no cover
146
236
  raise MissingDependencyError.create_for_package(
@@ -0,0 +1,182 @@
1
+ """Type-safe error handling utilities for extraction pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import traceback
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Callable
10
+
11
+ from kreuzberg._types import ErrorContextType, ExtractionResult, Metadata, ProcessingErrorDict
12
+ from kreuzberg.exceptions import KreuzbergError, MissingDependencyError, ValidationError
13
+
14
+
15
+ def should_exception_bubble_up(exception: Exception, context: ErrorContextType = "unknown") -> bool:
16
+ """Determine if an exception should bubble up or be handled gracefully.
17
+
18
+ Args:
19
+ exception: The exception to classify
20
+ context: The context where the exception occurred (e.g., "batch_processing", "single_extraction", "optional_feature")
21
+
22
+ Returns:
23
+ True if the exception should bubble up, False if it should be handled gracefully
24
+ """
25
+ if isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError)):
26
+ return True
27
+
28
+ if isinstance(exception, MissingDependencyError):
29
+ return True
30
+
31
+ if isinstance(exception, ValidationError):
32
+ if context == "batch_processing":
33
+ return False
34
+
35
+ return context != "optional_feature"
36
+
37
+ if isinstance(exception, KreuzbergError) and context == "optional_feature":
38
+ return False
39
+
40
+ if context == "batch_processing":
41
+ return isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError))
42
+
43
+ return not (context == "optional_feature" and isinstance(exception, (IOError, ImportError)))
44
+
45
+
46
+ class FeatureProcessingError:
47
+ """Type-safe processing error for extraction features."""
48
+
49
+ def __init__(self, feature: str, error: Exception) -> None:
50
+ self._feature = feature
51
+ self._error = error
52
+ self._traceback = traceback.format_exc()
53
+
54
+ @property
55
+ def feature(self) -> str:
56
+ return self._feature
57
+
58
+ @property
59
+ def error_type(self) -> str:
60
+ return type(self._error).__name__
61
+
62
+ @property
63
+ def error_message(self) -> str:
64
+ return str(self._error)
65
+
66
+ @property
67
+ def traceback(self) -> str:
68
+ return self._traceback
69
+
70
+ def to_dict(self) -> ProcessingErrorDict:
71
+ return {
72
+ "feature": self.feature,
73
+ "error_type": self.error_type,
74
+ "error_message": self.error_message,
75
+ "traceback": self.traceback,
76
+ }
77
+
78
+
79
+ def safe_feature_execution(
80
+ feature_name: str,
81
+ execution_func: Callable[[], Any],
82
+ default_value: Any,
83
+ result: ExtractionResult,
84
+ context: ErrorContextType = "optional_feature",
85
+ ) -> Any:
86
+ """Safely execute a feature extraction function with proper error handling.
87
+
88
+ Args:
89
+ feature_name: Name of the feature being executed
90
+ execution_func: Function to execute that may raise exceptions
91
+ default_value: Default value to return if execution fails
92
+ result: ExtractionResult to update with error information
93
+ context: The context for exception handling decisions
94
+
95
+ Returns:
96
+ Either the successful result or the default value
97
+ """
98
+ try:
99
+ return execution_func()
100
+ except Exception as e:
101
+ if should_exception_bubble_up(e, context):
102
+ raise
103
+
104
+ _add_processing_error(result, FeatureProcessingError(feature_name, e))
105
+ return default_value
106
+
107
+
108
+ def _add_processing_error(result: ExtractionResult, error: FeatureProcessingError) -> None:
109
+ """Add a processing error to the result metadata in a type-safe way."""
110
+ if result.metadata is None:
111
+ result.metadata = {}
112
+
113
+ if "processing_errors" not in result.metadata:
114
+ result.metadata["processing_errors"] = []
115
+
116
+ errors_list = result.metadata["processing_errors"]
117
+ if isinstance(errors_list, list):
118
+ errors_list.append(error.to_dict())
119
+ else:
120
+ result.metadata["processing_errors"] = [error.to_dict()]
121
+
122
+
123
+ def preserve_result_with_errors(
124
+ result: ExtractionResult,
125
+ errors: list[FeatureProcessingError],
126
+ ) -> ExtractionResult:
127
+ """Preserve a successful extraction result while adding error information.
128
+
129
+ This is used when core extraction succeeds but optional features fail.
130
+
131
+ Args:
132
+ result: The successful extraction result
133
+ errors: List of errors that occurred during optional processing
134
+
135
+ Returns:
136
+ The result with error information added to metadata
137
+ """
138
+ for error in errors:
139
+ _add_processing_error(result, error)
140
+
141
+ return result
142
+
143
+
144
+ def create_error_result(
145
+ content: str,
146
+ mime_type: str,
147
+ errors: list[FeatureProcessingError],
148
+ **metadata_kwargs: Any,
149
+ ) -> ExtractionResult:
150
+ """Create an error result with proper type safety.
151
+
152
+ Args:
153
+ content: Error content to include
154
+ mime_type: MIME type of the result
155
+ errors: List of errors that occurred
156
+ **metadata_kwargs: Additional metadata to include
157
+
158
+ Returns:
159
+ An ExtractionResult with error information
160
+ """
161
+ metadata: Metadata = {
162
+ "error": f"Multiple processing errors occurred: {len(errors)} errors",
163
+ "error_context": {
164
+ "error_count": len(errors),
165
+ "errors": [error.to_dict() for error in errors],
166
+ **metadata_kwargs,
167
+ },
168
+ "processing_errors": [error.to_dict() for error in errors],
169
+ }
170
+
171
+ return ExtractionResult(
172
+ content=content,
173
+ chunks=[],
174
+ mime_type=mime_type,
175
+ metadata=metadata,
176
+ entities=[],
177
+ keywords=[],
178
+ detected_languages=[],
179
+ tables=[],
180
+ images=[],
181
+ image_ocr_results=[],
182
+ )
@@ -230,13 +230,13 @@ class Extractor(ABC):
230
230
  confidence_score=None,
231
231
  processing_time=duration,
232
232
  )
233
- except (OSError, ValueError) as e: # pragma: no cover
233
+ except ValueError as e: # pragma: no cover
234
234
  return ImageOCRResult(
235
235
  image=target,
236
236
  ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
237
237
  skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
238
238
  )
239
- except (RuntimeError, TypeError) as e: # pragma: no cover
239
+ except TypeError as e: # pragma: no cover
240
240
  return ImageOCRResult(
241
241
  image=target,
242
242
  ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),