kreuzberg 3.17.3__tar.gz → 3.18.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (361)
  1. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.pre-commit-config.yaml +1 -1
  2. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/PKG-INFO +4 -4
  3. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/api-server.md +32 -1
  4. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_api/main.py +43 -3
  5. kreuzberg-3.18.0/kreuzberg/_entity_extraction.py +244 -0
  6. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/pyproject.toml +5 -5
  7. kreuzberg-3.18.0/tests/api/environment_config_test.py +154 -0
  8. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/pdf_test.py +74 -0
  9. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/entity_extraction_test.py +32 -157
  10. kreuzberg-3.18.0/tests/test_source_files/image-only-german-pdf.pdf +0 -0
  11. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/uv.lock +165 -141
  12. kreuzberg-3.17.3/kreuzberg/_entity_extraction.py +0 -150
  13. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.commitlintrc +0 -0
  14. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.deepsource.toml +0 -0
  15. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.docker/Dockerfile +0 -0
  16. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.docker/README.md +0 -0
  17. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.dockerignore +0 -0
  18. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/dependabot.yaml +0 -0
  19. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/ci.yaml +0 -0
  20. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/docker-e2e-tests.yml +0 -0
  21. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/docs.yml +0 -0
  22. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/pr-title.yaml +0 -0
  23. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/publish-docker.yml +0 -0
  24. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/release.yaml +0 -0
  25. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/test-docker-builds.yml +0 -0
  26. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.gitignore +0 -0
  27. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.markdownlint.yaml +0 -0
  28. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.prettierignore +0 -0
  29. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/ATTRIBUTIONS.md +0 -0
  30. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/LICENSE +0 -0
  31. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/README.md +0 -0
  32. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/Taskfile.yml +0 -0
  33. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/ai-rulez.yaml +0 -0
  34. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/README.md +0 -0
  35. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/__init__.py +0 -0
  36. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/batch_size_benchmark.py +0 -0
  37. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/batch_validation_benchmark.py +0 -0
  38. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/py.typed +0 -0
  39. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/pyproject.toml +0 -0
  40. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/__init__.py +0 -0
  41. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/__main__.py +0 -0
  42. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/benchmarks.py +0 -0
  43. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/cli.py +0 -0
  44. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/models.py +0 -0
  45. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/profiler.py +0 -0
  46. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/runner.py +0 -0
  47. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/token_reduction_compression_benchmark.py +0 -0
  48. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/advanced/custom-extractors.md +0 -0
  49. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/advanced/custom-hooks.md +0 -0
  50. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/advanced/error-handling.md +0 -0
  51. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/advanced/index.md +0 -0
  52. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/advanced/performance.md +0 -0
  53. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/exceptions.md +0 -0
  54. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/extraction-functions.md +0 -0
  55. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/extractor-registry.md +0 -0
  56. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/index.md +0 -0
  57. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/ocr-configuration.md +0 -0
  58. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/types.md +0 -0
  59. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/assets/favicon.png +0 -0
  60. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/assets/logo.png +0 -0
  61. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/cli.md +0 -0
  62. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/contributing.md +0 -0
  63. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/css/extra.css +0 -0
  64. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/examples/extraction-examples.md +0 -0
  65. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/examples/index.md +0 -0
  66. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/getting-started/index.md +0 -0
  67. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/getting-started/installation.md +0 -0
  68. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/getting-started/quick-start.md +0 -0
  69. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/index.md +0 -0
  70. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/basic-usage.md +0 -0
  71. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/chunking.md +0 -0
  72. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/docker.md +0 -0
  73. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/document-classification.md +0 -0
  74. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/extraction-configuration.md +0 -0
  75. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/index.md +0 -0
  76. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/mcp-server.md +0 -0
  77. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/metadata-extraction.md +0 -0
  78. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/ocr-backends.md +0 -0
  79. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/ocr-configuration.md +0 -0
  80. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/supported-formats.md +0 -0
  81. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/token-reduction.md +0 -0
  82. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/__init__.py +0 -0
  83. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/__main__.py +0 -0
  84. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_api/__init__.py +0 -0
  85. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_api/_config_cache.py +0 -0
  86. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_chunker.py +0 -0
  87. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_config.py +0 -0
  88. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_constants.py +0 -0
  89. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_document_classification.py +0 -0
  90. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/__init__.py +0 -0
  91. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_base.py +0 -0
  92. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_email.py +0 -0
  93. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_html.py +0 -0
  94. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_image.py +0 -0
  95. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_pandoc.py +0 -0
  96. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_pdf.py +0 -0
  97. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_presentation.py +0 -0
  98. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  99. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_structured.py +0 -0
  100. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_gmft.py +0 -0
  101. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_language_detection.py +0 -0
  102. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_mcp/__init__.py +0 -0
  103. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_mcp/server.py +0 -0
  104. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_mime_types.py +0 -0
  105. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/__init__.py +0 -0
  106. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/_base.py +0 -0
  107. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/_easyocr.py +0 -0
  108. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
  109. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/_table_extractor.py +0 -0
  110. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/_tesseract.py +0 -0
  111. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_playa.py +0 -0
  112. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_registry.py +0 -0
  113. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/__init__.py +0 -0
  114. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/_reducer.py +0 -0
  115. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/_stopwords.py +0 -0
  116. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
  117. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
  118. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
  119. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
  120. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
  121. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
  122. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
  123. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
  124. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
  125. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
  126. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
  127. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
  128. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
  129. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
  130. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
  131. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
  132. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
  133. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
  134. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
  135. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
  136. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
  137. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
  138. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
  139. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
  140. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
  141. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
  142. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
  143. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
  144. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
  145. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
  146. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
  147. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
  148. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
  149. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
  150. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
  151. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
  152. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
  153. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
  154. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
  155. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
  156. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
  157. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
  158. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
  159. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
  160. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
  161. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
  162. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
  163. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
  164. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
  165. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
  166. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
  167. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
  168. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
  169. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
  170. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
  171. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
  172. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
  173. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
  174. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
  175. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
  176. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
  177. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
  178. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
  179. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
  180. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_types.py +0 -0
  181. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/__init__.py +0 -0
  182. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_cache.py +0 -0
  183. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_device.py +0 -0
  184. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_document_cache.py +0 -0
  185. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_errors.py +0 -0
  186. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_html_streaming.py +0 -0
  187. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_image_preprocessing.py +0 -0
  188. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_ocr_cache.py +0 -0
  189. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  190. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_process_pool.py +0 -0
  191. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_quality.py +0 -0
  192. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_ref.py +0 -0
  193. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_resource_managers.py +0 -0
  194. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_serialization.py +0 -0
  195. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_string.py +0 -0
  196. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_sync.py +0 -0
  197. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_table.py +0 -0
  198. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_tmp.py +0 -0
  199. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/cli.py +0 -0
  200. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/exceptions.py +0 -0
  201. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/extraction.py +0 -0
  202. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/py.typed +0 -0
  203. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/mkdocs.yaml +0 -0
  204. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/__init__.py +0 -0
  205. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/__init__.py +0 -0
  206. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/config_cache_test.py +0 -0
  207. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/conftest.py +0 -0
  208. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/header_config_hashing_test.py +0 -0
  209. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/image_extraction_test.py +0 -0
  210. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/main_test.py +0 -0
  211. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/runtime_config_test.py +0 -0
  212. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/conftest.py +0 -0
  213. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/__init__.py +0 -0
  214. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/comprehensive_config_test.py +0 -0
  215. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/config_test.py +0 -0
  216. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/constants_test.py +0 -0
  217. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/dpi_configuration_test.py +0 -0
  218. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/exceptions_test.py +0 -0
  219. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/extraction_batch_test.py +0 -0
  220. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/extraction_test.py +0 -0
  221. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/html_to_markdown_config_test.py +0 -0
  222. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/image_ocr_result_test.py +0 -0
  223. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/init_test.py +0 -0
  224. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/main_test.py +0 -0
  225. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/mime_types_test.py +0 -0
  226. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/registry_test.py +0 -0
  227. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/types_test.py +0 -0
  228. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/e2e/__init__.py +0 -0
  229. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/e2e/docker_e2e.py +0 -0
  230. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/README_image_tests.md +0 -0
  231. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/__init__.py +0 -0
  232. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/base_extractor_test.py +0 -0
  233. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/base_memory_limits_test.py +0 -0
  234. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/base_ocr_processing_test.py +0 -0
  235. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/base_ocr_simple_test.py +0 -0
  236. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/email_error_paths_test.py +0 -0
  237. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/email_test.py +0 -0
  238. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/html_invalid_base64_test.py +0 -0
  239. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/html_test.py +0 -0
  240. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/image_deduplication_test.py +0 -0
  241. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/image_error_handling_test.py +0 -0
  242. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/image_error_simple_test.py +0 -0
  243. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/image_test.py +0 -0
  244. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/json_test.py +0 -0
  245. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  246. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/pandoc_test.py +0 -0
  247. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/pdf_images_test.py +0 -0
  248. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/pdf_sync_images_test.py +0 -0
  249. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/presentation_test.py +0 -0
  250. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/spreadsheet_test.py +0 -0
  251. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/structured_test.py +0 -0
  252. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/__init__.py +0 -0
  253. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/chunker_test.py +0 -0
  254. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/document_classification_test.py +0 -0
  255. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/gmft_test.py +0 -0
  256. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/hooks_test.py +0 -0
  257. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/language_detection_test.py +0 -0
  258. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/table_extraction_test.py +0 -0
  259. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/token_reduction_test.py +0 -0
  260. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/__init__.py +0 -0
  261. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/all_extractors_images_test.py +0 -0
  262. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/api/__init__.py +0 -0
  263. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/api/large_file_test.py +0 -0
  264. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/api/mounted_config_test.py +0 -0
  265. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/dpi_integration_test.py +0 -0
  266. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/multiprocessing/__init__.py +0 -0
  267. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
  268. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/ocr/__init__.py +0 -0
  269. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/ocr/device_integration_test.py +0 -0
  270. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
  271. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
  272. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/pandoc_images_test.py +0 -0
  273. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/pdf_images_test.py +0 -0
  274. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/pdf_real_images_test.py +0 -0
  275. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/pptx_complex_test.py +0 -0
  276. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/pptx_images_test.py +0 -0
  277. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/regression_test.py +0 -0
  278. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/token_reduction_integration_test.py +0 -0
  279. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/interfaces/__init__.py +0 -0
  280. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/interfaces/cli_test.py +0 -0
  281. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/interfaces/mcp_server_test.py +0 -0
  282. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/mcp/__init__.py +0 -0
  283. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/mcp/mcp_server_test.py +0 -0
  284. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/multiprocessing/__init__.py +0 -0
  285. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  286. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/multiprocessing/process_manager_test.py +0 -0
  287. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  288. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/__init__.py +0 -0
  289. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/base_test.py +0 -0
  290. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/easyocr_test.py +0 -0
  291. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/init_test.py +0 -0
  292. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/paddleocr_test.py +0 -0
  293. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/tesseract_test.py +0 -0
  294. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/tesseract_tsv_test.py +0 -0
  295. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/performance/__init__.py +0 -0
  296. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/performance/large_pdf_perf_test.py +0 -0
  297. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
  298. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/contract.txt +0 -0
  299. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/contract_test.txt +0 -0
  300. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/document.docx +0 -0
  301. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/email/sample-email.eml +0 -0
  302. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  303. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/excel.xlsx +0 -0
  304. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/flower-no-text.jpg +0 -0
  305. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/form_test.txt +0 -0
  306. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/french-text.txt +0 -0
  307. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/german-text.txt +0 -0
  308. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/google-doc-document.pdf +0 -0
  309. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/html.html +0 -0
  310. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/images/test_hello_world.png +0 -0
  311. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/invoice_image.png +0 -0
  312. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/invoice_test.txt +0 -0
  313. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/complex_nested.json +0 -0
  314. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/aws_policy.json +0 -0
  315. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/earthquakes.geojson +0 -0
  316. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/github_emojis.json +0 -0
  317. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/iss_location.json +0 -0
  318. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/openapi_spec.json +0 -0
  319. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/package.json +0 -0
  320. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/rick_morty_character.json +0 -0
  321. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/sample-document.json +0 -0
  322. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/schema_test.json +0 -0
  323. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  324. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/markdown.md +0 -0
  325. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  326. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/non-searchable.pdf +0 -0
  327. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/ocr-image.jpg +0 -0
  328. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  329. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  330. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  331. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  332. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/receipt_test.txt +0 -0
  333. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/report_test.txt +0 -0
  334. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/sample-contract.pdf +0 -0
  335. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/scanned.pdf +0 -0
  336. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/searchable.pdf +0 -0
  337. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/sharable-web-guide.pdf +0 -0
  338. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/spanish-text.txt +0 -0
  339. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/tables/borderless_table.png +0 -0
  340. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/tables/complex_document.png +0 -0
  341. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/tables/simple_table.png +0 -0
  342. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/test-article.pdf +0 -0
  343. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/test-excel.xls +0 -0
  344. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  345. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/__init__.py +0 -0
  346. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/cache_test.py +0 -0
  347. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/device_test.py +0 -0
  348. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/errors_test.py +0 -0
  349. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/ocr_cache_test.py +0 -0
  350. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/pdf_lock_test.py +0 -0
  351. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/playa_helpers_test.py +0 -0
  352. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/playa_metadata_test.py +0 -0
  353. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/playa_test.py +0 -0
  354. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/process_pool_test.py +0 -0
  355. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/quality_test.py +0 -0
  356. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/ref_test.py +0 -0
  357. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/serialization_test.py +0 -0
  358. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/string_test.py +0 -0
  359. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/sync_test.py +0 -0
  360. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/table_test.py +0 -0
  361. {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/tmp_test.py +0 -0
@@ -49,7 +49,7 @@ repos:
49
49
  hooks:
50
50
  - id: pyproject-fmt
51
51
  - repo: https://github.com/astral-sh/ruff-pre-commit
52
- rev: v0.13.1
52
+ rev: v0.13.2
53
53
  hooks:
54
54
  - id: ruff
55
55
  args: ["--fix", "--unsafe-fixes"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.17.3
3
+ Version: 3.18.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -28,12 +28,12 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
28
  Classifier: Topic :: Text Processing :: General
29
29
  Classifier: Typing :: Typed
30
30
  Requires-Python: >=3.10
31
- Requires-Dist: anyio>=4.10.0
31
+ Requires-Dist: anyio>=4.11.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.14.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.16.0
35
35
  Requires-Dist: langcodes>=3.5.0
36
- Requires-Dist: mcp>=1.14.1
36
+ Requires-Dist: mcp>=1.15.0
37
37
  Requires-Dist: msgspec>=0.18.0
38
38
  Requires-Dist: numpy>=2.0.0
39
39
  Requires-Dist: playa-pdf>=0.7.0
@@ -62,7 +62,7 @@ Extract text from one or more files.
62
62
  - Method: `POST`
63
63
  - Content-Type: `multipart/form-data`
64
64
  - Body: One or more files with field name `data`
65
- - **Maximum file size: 1GB per file**
65
+ - **Maximum file size: Configurable via `KREUZBERG_MAX_UPLOAD_SIZE` environment variable (default: 1GB per file)**
66
66
 
67
67
  **Response:**
68
68
 
@@ -463,6 +463,37 @@ The API server uses the default Kreuzberg extraction configuration:
463
463
  - PDF, image, and document extraction is supported
464
464
  - Table extraction with GMFT (if installed)
465
465
 
466
+ ### Environment Variables
467
+
468
+ The API server can be configured using environment variables for production deployments:
469
+
470
+ #### Server Configuration
471
+
472
+ | Variable | Description | Default | Example |
473
+ | -------------------------------- | ---------------------------- | ------------------ | ------------------ |
474
+ | `KREUZBERG_MAX_UPLOAD_SIZE` | Maximum upload size in bytes | `1073741824` (1GB) | `2147483648` (2GB) |
475
+ | `KREUZBERG_ENABLE_OPENTELEMETRY` | Enable OpenTelemetry tracing | `true` | `false` |
476
+
477
+ #### Usage Examples
478
+
479
+ ```bash
480
+ # Set 2GB upload limit
481
+ export KREUZBERG_MAX_UPLOAD_SIZE=2147483648
482
+ litestar --app kreuzberg._api.main:app run
483
+
484
+ # Disable telemetry
485
+ export KREUZBERG_ENABLE_OPENTELEMETRY=false
486
+ uvicorn kreuzberg._api.main:app --host 0.0.0.0 --port 8000
487
+
488
+ # Production settings with Docker
489
+ docker run -p 8000:8000 \
490
+ -e KREUZBERG_MAX_UPLOAD_SIZE=5368709120 \
491
+ -e KREUZBERG_ENABLE_OPENTELEMETRY=true \
492
+ goldziher/kreuzberg:latest
493
+ ```
494
+
495
+ **Note**: Boolean environment variables are matched case-insensitively: `true`, `1`, `yes`, and `on` enable a setting; any other value (such as `false`, `0`, `no`, `off`, or an empty string) disables it.
496
+
466
497
  To use custom configuration, modify the extraction call in your own API wrapper:
467
498
 
468
499
  ```python
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import base64
4
4
  import io
5
+ import os
5
6
  import traceback
6
7
  from json import dumps
7
8
  from typing import TYPE_CHECKING, Annotated, Any, Literal
@@ -100,6 +101,36 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
100
101
  )
101
102
 
102
103
 
104
+ def _get_max_upload_size() -> int:
105
+ """Get the maximum upload size from environment variable.
106
+
107
+ Returns:
108
+ Maximum upload size in bytes. Defaults to 1GB if not set.
109
+
110
+ Environment Variables:
111
+ KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
112
+ """
113
+ default_size = 1024 * 1024 * 1024 # 1GB
114
+ try:
115
+ size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
116
+ # Return default if negative
117
+ return size if size >= 0 else default_size
118
+ except ValueError:
119
+ return default_size
120
+
121
+
122
+ def _is_opentelemetry_enabled() -> bool:
123
+ """Check if OpenTelemetry should be enabled.
124
+
125
+ Returns:
126
+ True if OpenTelemetry should be enabled, False otherwise.
127
+
128
+ Environment Variables:
129
+ KREUZBERG_ENABLE_OPENTELEMETRY: Enable OpenTelemetry tracing (true/false) (default: true)
130
+ """
131
+ return os.environ.get("KREUZBERG_ENABLE_OPENTELEMETRY", "true").lower() in ("true", "1", "yes", "on")
132
+
133
+
103
134
  def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
104
135
  error_type = type(exception).__name__
105
136
  error_message = str(exception)
@@ -242,7 +273,7 @@ async def handle_files_upload( # noqa: PLR0913
242
273
  - Language detection (if enabled)
243
274
 
244
275
  Supports various file formats including PDF, Office documents, images, and more.
245
- Maximum file size: 1GB per file.
276
+ Maximum file size: Configurable via KREUZBERG_MAX_UPLOAD_SIZE environment variable (default: 1GB per file).
246
277
 
247
278
  Args:
248
279
  request: The HTTP request object
@@ -379,9 +410,18 @@ type_encoders = {
379
410
  Image.Image: _pil_image_encoder,
380
411
  }
381
412
 
413
+
414
+ def _get_plugins() -> list[Any]:
415
+ """Get configured plugins based on environment variables."""
416
+ plugins = []
417
+ if _is_opentelemetry_enabled():
418
+ plugins.append(OpenTelemetryPlugin(OpenTelemetryConfig()))
419
+ return plugins
420
+
421
+
382
422
  app = Litestar(
383
423
  route_handlers=[handle_files_upload, health_check, get_configuration],
384
- plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
424
+ plugins=_get_plugins(),
385
425
  logging_config=StructLoggingConfig(),
386
426
  openapi_config=openapi_config,
387
427
  exception_handlers={
@@ -389,5 +429,5 @@ app = Litestar(
389
429
  Exception: general_exception_handler,
390
430
  },
391
431
  type_encoders=type_encoders,
392
- request_max_body_size=1024 * 1024 * 1024,
432
+ request_max_body_size=_get_max_upload_size(),
393
433
  )
@@ -0,0 +1,244 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+ import shutil
6
+ import subprocess
7
+ from functools import lru_cache
8
+ from itertools import chain
9
+ from typing import TYPE_CHECKING, Any
10
+
11
+ import anyio
12
+
13
+ from kreuzberg._types import Entity, SpacyEntityExtractionConfig
14
+ from kreuzberg._utils._sync import run_sync
15
+ from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
16
+
17
+ if TYPE_CHECKING:
18
+ from collections.abc import Sequence
19
+
20
+
21
+ def is_uv_available() -> bool:
22
+ """Check if uv is available in the environment."""
23
+ return shutil.which("uv") is not None
24
+
25
+
26
+ def get_spacy_model_url(model_name: str, version: str = "3.8.0") -> str:
27
+ """Get the direct download URL for a spaCy model.
28
+
29
+ Args:
30
+ model_name: Name of the spaCy model (e.g., 'en_core_web_sm')
31
+ version: Model version to download (default: 3.8.0)
32
+
33
+ Returns:
34
+ Direct download URL for the model
35
+ """
36
+ return f"https://github.com/explosion/spacy-models/releases/download/{model_name}-{version}/{model_name}-{version}-py3-none-any.whl"
37
+
38
+
39
+ async def install_spacy_model_with_uv(model_name: str) -> subprocess.CompletedProcess[str]:
40
+ """Install spaCy model using uv.
41
+
42
+ Args:
43
+ model_name: Name of the spaCy model to install
44
+
45
+ Returns:
46
+ Completed process result
47
+ """
48
+ model_url = get_spacy_model_url(model_name)
49
+ return await run_sync(
50
+ subprocess.run,
51
+ ["uv", "pip", "install", model_url],
52
+ capture_output=True,
53
+ text=True,
54
+ check=False,
55
+ )
56
+
57
+
58
+ async def install_spacy_model_with_spacy(model_name: str) -> bool:
59
+ """Install spaCy model using spacy download function.
60
+
61
+ Args:
62
+ model_name: Name of the spaCy model to install
63
+
64
+ Returns:
65
+ True if successful, False otherwise
66
+ """
67
+ try:
68
+ import spacy.cli.download # noqa: PLC0415
69
+
70
+ await run_sync(spacy.cli.download, model_name) # type: ignore[attr-defined]
71
+ return True
72
+ except (ImportError, OSError, RuntimeError):
73
+ return False
74
+
75
+
76
+ def extract_entities(
77
+ text: str,
78
+ entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
79
+ custom_patterns: frozenset[tuple[str, str]] | None = None,
80
+ languages: list[str] | None = None,
81
+ spacy_config: SpacyEntityExtractionConfig | None = None,
82
+ ) -> list[Entity]:
83
+ entities: list[Entity] = []
84
+ if custom_patterns:
85
+ entities.extend(
86
+ chain.from_iterable(
87
+ (
88
+ Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
89
+ for match in re.finditer(pattern, text)
90
+ )
91
+ for ent_type, pattern in custom_patterns
92
+ )
93
+ )
94
+
95
+ if spacy_config is None:
96
+ spacy_config = SpacyEntityExtractionConfig()
97
+
98
+ try:
99
+ import spacy # noqa: F401, PLC0415
100
+ except ImportError as e: # pragma: no cover
101
+ raise MissingDependencyError.create_for_package(
102
+ package_name="spacy",
103
+ dependency_group="entity-extraction",
104
+ functionality="Entity Extraction",
105
+ ) from e
106
+
107
+ model_name = select_spacy_model(languages, spacy_config)
108
+ if not model_name:
109
+ return entities
110
+
111
+ nlp = load_spacy_model(model_name, spacy_config)
112
+
113
+ if len(text) > spacy_config.max_doc_length:
114
+ text = text[: spacy_config.max_doc_length]
115
+
116
+ doc = nlp(text)
117
+
118
+ entity_type_mapping = {etype.upper() for etype in entity_types}
119
+
120
+ entities.extend(
121
+ Entity(
122
+ type=ent.label_,
123
+ text=ent.text,
124
+ start=ent.start_char,
125
+ end=ent.end_char,
126
+ )
127
+ for ent in doc.ents
128
+ if ent.label_ in entity_type_mapping or ent.label_.upper() in entity_type_mapping
129
+ )
130
+
131
+ return entities
132
+
133
+
134
+ @lru_cache(maxsize=32)
135
+ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
136
+ try:
137
+ import spacy # noqa: PLC0415
138
+ except ImportError:
139
+ return None
140
+
141
+ if spacy_config.model_cache_dir:
142
+ os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
143
+
144
+ try:
145
+ nlp = spacy.load(model_name)
146
+ except OSError:
147
+ # Try to download the model automatically
148
+ async def install_model() -> tuple[bool, str | None]:
149
+ """Install model and return success status and error message."""
150
+ # First try spaCy's built-in download
151
+ try:
152
+ success = await install_spacy_model_with_spacy(model_name)
153
+ if success:
154
+ return True, None
155
+ except (ImportError, OSError, RuntimeError) as e:
156
+ spacy_error = str(e)
157
+ else:
158
+ spacy_error = "spaCy download failed"
159
+
160
+ # If spaCy download failed and uv is available, try uv as fallback
161
+ if is_uv_available():
162
+ try:
163
+ result = await install_spacy_model_with_uv(model_name)
164
+ return result.returncode == 0, result.stderr
165
+ except (OSError, subprocess.SubprocessError) as e:
166
+ return False, f"spaCy: {spacy_error}, uv: {e!s}"
167
+
168
+ return False, spacy_error
169
+
170
+ # Run the async installation in a sync context
171
+ try:
172
+ success, error_details = anyio.run(install_model)
173
+ except (OSError, RuntimeError) as e:
174
+ success, error_details = False, str(e)
175
+
176
+ if not success:
177
+ # Generate appropriate error message based on available tools
178
+ if is_uv_available():
179
+ model_url = get_spacy_model_url(model_name)
180
+ manual_install_cmd = f"uv pip install {model_url}"
181
+ else:
182
+ manual_install_cmd = f"python -m spacy download {model_name}"
183
+
184
+ error_msg = (
185
+ f"Failed to download spaCy model '{model_name}'. Please install it manually with: {manual_install_cmd}"
186
+ )
187
+
188
+ if error_details:
189
+ error_msg += f"\nError details: {error_details}"
190
+
191
+ raise KreuzbergError(
192
+ error_msg,
193
+ context={
194
+ "model": model_name,
195
+ "manual_install_cmd": manual_install_cmd,
196
+ "error_details": error_details,
197
+ "uv_available": is_uv_available(),
198
+ },
199
+ ) from None
200
+
201
+ try:
202
+ nlp = spacy.load(model_name)
203
+ except OSError as e:
204
+ raise KreuzbergError(
205
+ f"Failed to load spaCy model '{model_name}' even after successful download. "
206
+ f"Please verify your spaCy installation and try reinstalling the model.",
207
+ context={"model": model_name, "error": str(e)},
208
+ ) from e
209
+
210
+ nlp.max_length = spacy_config.max_doc_length
211
+
212
+ return nlp
213
+
214
+
215
+ def select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
216
+ if not languages:
217
+ return spacy_config.get_model_for_language("en")
218
+
219
+ for lang in languages:
220
+ model_name = spacy_config.get_model_for_language(lang)
221
+ if model_name:
222
+ return model_name
223
+
224
+ return spacy_config.get_fallback_model()
225
+
226
+
227
+ def extract_keywords(
228
+ text: str,
229
+ keyword_count: int = 10,
230
+ ) -> list[tuple[str, float]]:
231
+ try:
232
+ from keybert import KeyBERT # noqa: PLC0415
233
+
234
+ kw_model = KeyBERT()
235
+ keywords = kw_model.extract_keywords(text, top_n=keyword_count)
236
+ return [(kw, float(score)) for kw, score in keywords]
237
+ except (RuntimeError, OSError, ValueError):
238
+ return []
239
+ except ImportError as e: # pragma: no cover
240
+ raise MissingDependencyError.create_for_package(
241
+ package_name="keybert",
242
+ dependency_group="entity-extraction",
243
+ functionality="Keyword Extraction",
244
+ ) from e
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
5
5
 
6
6
  [project]
7
7
  name = "kreuzberg"
8
- version = "3.17.3"
8
+ version = "3.18.0"
9
9
  description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
10
10
  readme = "README.md"
11
11
  keywords = [
@@ -57,12 +57,12 @@ classifiers = [
57
57
  ]
58
58
 
59
59
  dependencies = [
60
- "anyio>=4.10.0",
60
+ "anyio>=4.11.0",
61
61
  "chardetng-py>=0.3.5",
62
62
  "exceptiongroup>=1.2.2; python_version<'3.11'",
63
- "html-to-markdown[lxml]>=1.14.0",
63
+ "html-to-markdown[lxml]>=1.16.0",
64
64
  "langcodes>=3.5.0",
65
- "mcp>=1.14.1",
65
+ "mcp>=1.15.0",
66
66
  "msgspec>=0.18.0",
67
67
  "numpy>=2.0.0",
68
68
  "playa-pdf>=0.7.0",
@@ -117,7 +117,7 @@ dev = [
117
117
  "pytest-rerunfailures>=16.0.1",
118
118
  "pytest-timeout>=2.4.0",
119
119
  "rich>=14.1.0",
120
- "ruff>=0.13.1",
120
+ "ruff>=0.13.2",
121
121
  "tabulate>=0.9.0",
122
122
  "trio>=0.31.0",
123
123
  "uv-bump",
@@ -0,0 +1,154 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from typing import TYPE_CHECKING, Any
5
+ from unittest.mock import patch
6
+
7
+ import pytest
8
+
9
+ if TYPE_CHECKING:
10
+ from litestar.testing import AsyncTestClient
11
+
12
+
13
+ def test_get_max_upload_size_default() -> None:
14
+ from kreuzberg._api.main import _get_max_upload_size
15
+
16
+ with patch.dict(os.environ, {}, clear=True):
17
+ assert _get_max_upload_size() == 1024 * 1024 * 1024 # 1GB
18
+
19
+
20
+ def test_get_max_upload_size_custom() -> None:
21
+ from kreuzberg._api.main import _get_max_upload_size
22
+
23
+ custom_size = 2 * 1024 * 1024 * 1024 # 2GB
24
+ with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": str(custom_size)}):
25
+ assert _get_max_upload_size() == custom_size
26
+
27
+
28
+ def test_get_max_upload_size_invalid_value() -> None:
29
+ from kreuzberg._api.main import _get_max_upload_size
30
+
31
+ with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": "invalid"}):
32
+ assert _get_max_upload_size() == 1024 * 1024 * 1024 # Falls back to default
33
+
34
+
35
+ def test_is_opentelemetry_enabled_default() -> None:
36
+ from kreuzberg._api.main import _is_opentelemetry_enabled
37
+
38
+ with patch.dict(os.environ, {}, clear=True):
39
+ assert _is_opentelemetry_enabled() is True
40
+
41
+
42
+ def test_is_opentelemetry_enabled_false() -> None:
43
+ from kreuzberg._api.main import _is_opentelemetry_enabled
44
+
45
+ test_cases = ["false", "False", "FALSE", "0", "no", "No", "off", "Off"]
46
+ for value in test_cases:
47
+ with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": value}):
48
+ assert _is_opentelemetry_enabled() is False, f"Failed for value: {value}"
49
+
50
+
51
+ def test_is_opentelemetry_enabled_true() -> None:
52
+ from kreuzberg._api.main import _is_opentelemetry_enabled
53
+
54
+ test_cases = ["true", "True", "TRUE", "1", "yes", "Yes", "on", "On"]
55
+ for value in test_cases:
56
+ with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": value}):
57
+ assert _is_opentelemetry_enabled() is True, f"Failed for value: {value}"
58
+
59
+
60
+ def test_get_plugins_with_opentelemetry_enabled() -> None:
61
+ from kreuzberg._api.main import _get_plugins
62
+
63
+ with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": "true"}):
64
+ plugins = _get_plugins()
65
+ assert len(plugins) == 1
66
+ assert type(plugins[0]).__name__ == "OpenTelemetryPlugin"
67
+
68
+
69
+ def test_get_plugins_with_opentelemetry_disabled() -> None:
70
+ from kreuzberg._api.main import _get_plugins
71
+
72
+ with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": "false"}):
73
+ plugins = _get_plugins()
74
+ assert len(plugins) == 0
75
+
76
+
77
+ @pytest.mark.anyio
78
+ async def test_app_configuration_with_custom_upload_size() -> None:
79
+ """Test that _get_max_upload_size returns the configured upload size"""
80
+ from kreuzberg._api.main import _get_max_upload_size
81
+
82
+ custom_size = 512 * 1024 * 1024 # 512MB
83
+
84
+ with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": str(custom_size)}):
85
+ assert _get_max_upload_size() == custom_size
86
+
87
+
88
+ @pytest.mark.anyio
89
+ async def test_large_file_upload_respected(test_client: AsyncTestClient[Any], tmp_path: Any) -> None:
90
+ """Test that large file upload limits are respected"""
91
+
92
+ # Create a test file that would exceed a small upload limit
93
+ test_file = tmp_path / "large_test.txt"
94
+ large_content = "x" * (2 * 1024 * 1024) # 2MB content
95
+ test_file.write_text(large_content)
96
+
97
+ # Test with original app (should work with default 1GB limit)
98
+ with test_file.open("rb") as f:
99
+ response = await test_client.post("/extract", files=[("data", (test_file.name, f.read(), "text/plain"))])
100
+
101
+ # Should succeed with default 1GB limit
102
+ assert response.status_code == 201
103
+
104
+
105
+ def test_environment_variable_combinations() -> None:
106
+ """Test various combinations of environment variables"""
107
+ from kreuzberg._api.main import _get_max_upload_size, _is_opentelemetry_enabled
108
+
109
+ test_env = {
110
+ "KREUZBERG_MAX_UPLOAD_SIZE": "5368709120", # 5GB
111
+ "KREUZBERG_ENABLE_OPENTELEMETRY": "false",
112
+ }
113
+
114
+ with patch.dict(os.environ, test_env):
115
+ assert _get_max_upload_size() == 5368709120
116
+ assert _is_opentelemetry_enabled() is False
117
+
118
+
119
+ def test_edge_cases_for_upload_size() -> None:
120
+ """Test edge cases for upload size configuration"""
121
+ from kreuzberg._api.main import _get_max_upload_size
122
+
123
+ # Test zero
124
+ with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": "0"}):
125
+ assert _get_max_upload_size() == 0
126
+
127
+ # Test very large number
128
+ large_size = str(10 * 1024 * 1024 * 1024) # 10GB
129
+ with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": large_size}):
130
+ assert _get_max_upload_size() == int(large_size)
131
+
132
+ # Test negative number (should fall back to default)
133
+ with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": "-1"}):
134
+ assert _get_max_upload_size() == 1024 * 1024 * 1024
135
+
136
+
137
+ def test_edge_cases_for_opentelemetry() -> None:
138
+ """Test edge cases for OpenTelemetry boolean configuration"""
139
+ from kreuzberg._api.main import _is_opentelemetry_enabled
140
+
141
+ # Test empty string (not a recognized truthy value, so treated as false)
142
+ with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": ""}):
143
+ assert _is_opentelemetry_enabled() is False
144
+
145
+ # Test random string (should default to false)
146
+ with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": "random"}):
147
+ assert _is_opentelemetry_enabled() is False
148
+
149
+ # Test numeric strings
150
+ with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": "2"}):
151
+ assert _is_opentelemetry_enabled() is False
152
+
153
+ with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": "1"}):
154
+ assert _is_opentelemetry_enabled() is True
@@ -903,3 +903,77 @@ async def test_pdf_extract_path_async_table_import_error(
903
903
 
904
904
  assert result.content == "Text content"
905
905
  assert result.tables == []
906
+
907
+
908
+ @pytest.fixture
909
+ def german_image_pdf() -> Path:
910
+ """Path to German image-only PDF that previously caused EmptyHtmlError."""
911
+ return Path(__file__).parent.parent / "test_source_files" / "image-only-german-pdf.pdf"
912
+
913
+
914
+ @pytest.mark.anyio
915
+ async def test_extract_german_image_pdf_async_with_force_ocr(german_image_pdf: Path) -> None:
916
+ """Test that German image-only PDF extracts successfully with force_ocr=True.
917
+
918
+ This test reproduces issue #149 where an image-only German PDF would fail
919
+ with EmptyHtmlError when using Tesseract OCR.
920
+ """
921
+ from kreuzberg._types import PSMMode, TesseractConfig
922
+
923
+ config = ExtractionConfig(
924
+ force_ocr=True, ocr_backend="tesseract", ocr_config=TesseractConfig(language="deu", psm=PSMMode.SINGLE_BLOCK)
925
+ )
926
+ extractor = PDFExtractor(mime_type="application/pdf", config=config)
927
+
928
+ result = await extractor.extract_path_async(german_image_pdf)
929
+
930
+ # Should extract German text successfully
931
+ assert result.content.strip(), "Should extract text content from German image PDF"
932
+ assert result.mime_type == "text/plain"
933
+ assert len(result.content) > 50, "Should extract meaningful amount of text"
934
+
935
+
936
+ def test_extract_german_image_pdf_sync_with_force_ocr(german_image_pdf: Path) -> None:
937
+ """Test that German image-only PDF extracts successfully with force_ocr=True (sync).
938
+
939
+ This test reproduces issue #149 where an image-only German PDF would fail
940
+ with EmptyHtmlError when using Tesseract OCR.
941
+ """
942
+ from kreuzberg._types import PSMMode, TesseractConfig
943
+
944
+ config = ExtractionConfig(
945
+ force_ocr=True, ocr_backend="tesseract", ocr_config=TesseractConfig(language="deu", psm=PSMMode.SINGLE_BLOCK)
946
+ )
947
+ extractor = PDFExtractor(mime_type="application/pdf", config=config)
948
+
949
+ result = extractor.extract_path_sync(german_image_pdf)
950
+
951
+ # Should extract German text successfully
952
+ assert result.content.strip(), "Should extract text content from German image PDF"
953
+ assert result.mime_type == "text/plain"
954
+ assert len(result.content) > 50, "Should extract meaningful amount of text"
955
+
956
+
957
+ @pytest.mark.anyio
958
+ async def test_extract_german_image_pdf_async_default_config(german_image_pdf: Path) -> None:
959
+ """Test that German image-only PDF extracts with default OCR config."""
960
+ config = ExtractionConfig(ocr_backend="tesseract")
961
+ extractor = PDFExtractor(mime_type="application/pdf", config=config)
962
+
963
+ result = await extractor.extract_path_async(german_image_pdf)
964
+
965
+ # Should extract some text content even with default config
966
+ assert result.content.strip(), "Should extract text content with default config"
967
+ assert result.mime_type == "text/plain"
968
+
969
+
970
+ def test_extract_german_image_pdf_sync_default_config(german_image_pdf: Path) -> None:
971
+ """Test that German image-only PDF extracts with default OCR config (sync)."""
972
+ config = ExtractionConfig(ocr_backend="tesseract")
973
+ extractor = PDFExtractor(mime_type="application/pdf", config=config)
974
+
975
+ result = extractor.extract_path_sync(german_image_pdf)
976
+
977
+ # Should extract some text content even with default config
978
+ assert result.content.strip(), "Should extract text content with default config"
979
+ assert result.mime_type == "text/plain"