kreuzberg 3.17.2__tar.gz → 3.18.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (361)
  1. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.github/workflows/ci.yaml +15 -9
  2. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.pre-commit-config.yaml +1 -1
  3. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/PKG-INFO +4 -4
  4. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/Taskfile.yml +3 -3
  5. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/ai-rulez.yaml +4 -3
  6. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/contributing.md +9 -5
  7. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/api-server.md +32 -1
  8. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_api/main.py +43 -3
  9. kreuzberg-3.18.0/kreuzberg/_entity_extraction.py +244 -0
  10. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_language_detection.py +0 -2
  11. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/extraction.py +0 -1
  12. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/pyproject.toml +5 -6
  13. kreuzberg-3.18.0/tests/api/environment_config_test.py +154 -0
  14. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/pdf_test.py +74 -0
  15. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/features/entity_extraction_test.py +36 -105
  16. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/features/language_detection_test.py +1 -12
  17. kreuzberg-3.18.0/tests/test_source_files/image-only-german-pdf.pdf +0 -0
  18. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/uv.lock +164 -209
  19. kreuzberg-3.17.2/kreuzberg/_entity_extraction.py +0 -122
  20. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.commitlintrc +0 -0
  21. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.deepsource.toml +0 -0
  22. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.docker/Dockerfile +0 -0
  23. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.docker/README.md +0 -0
  24. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.dockerignore +0 -0
  25. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.github/dependabot.yaml +0 -0
  26. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.github/workflows/docker-e2e-tests.yml +0 -0
  27. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.github/workflows/docs.yml +0 -0
  28. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.github/workflows/pr-title.yaml +0 -0
  29. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.github/workflows/publish-docker.yml +0 -0
  30. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.github/workflows/release.yaml +0 -0
  31. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.github/workflows/test-docker-builds.yml +0 -0
  32. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.gitignore +0 -0
  33. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.markdownlint.yaml +0 -0
  34. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/.prettierignore +0 -0
  35. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/ATTRIBUTIONS.md +0 -0
  36. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/LICENSE +0 -0
  37. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/README.md +0 -0
  38. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/README.md +0 -0
  39. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/__init__.py +0 -0
  40. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/batch_size_benchmark.py +0 -0
  41. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/batch_validation_benchmark.py +0 -0
  42. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/py.typed +0 -0
  43. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/pyproject.toml +0 -0
  44. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/src/__init__.py +0 -0
  45. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/src/__main__.py +0 -0
  46. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/src/benchmarks.py +0 -0
  47. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/src/cli.py +0 -0
  48. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/src/models.py +0 -0
  49. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/src/profiler.py +0 -0
  50. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/src/runner.py +0 -0
  51. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/benchmarks/token_reduction_compression_benchmark.py +0 -0
  52. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/advanced/custom-extractors.md +0 -0
  53. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/advanced/custom-hooks.md +0 -0
  54. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/advanced/error-handling.md +0 -0
  55. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/advanced/index.md +0 -0
  56. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/advanced/performance.md +0 -0
  57. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/api-reference/exceptions.md +0 -0
  58. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/api-reference/extraction-functions.md +0 -0
  59. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/api-reference/extractor-registry.md +0 -0
  60. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/api-reference/index.md +0 -0
  61. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/api-reference/ocr-configuration.md +0 -0
  62. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/api-reference/types.md +0 -0
  63. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/assets/favicon.png +0 -0
  64. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/assets/logo.png +0 -0
  65. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/cli.md +0 -0
  66. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/css/extra.css +0 -0
  67. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/examples/extraction-examples.md +0 -0
  68. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/examples/index.md +0 -0
  69. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/getting-started/index.md +0 -0
  70. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/getting-started/installation.md +0 -0
  71. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/getting-started/quick-start.md +0 -0
  72. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/index.md +0 -0
  73. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/basic-usage.md +0 -0
  74. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/chunking.md +0 -0
  75. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/docker.md +0 -0
  76. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/document-classification.md +0 -0
  77. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/extraction-configuration.md +0 -0
  78. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/index.md +0 -0
  79. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/mcp-server.md +0 -0
  80. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/metadata-extraction.md +0 -0
  81. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/ocr-backends.md +0 -0
  82. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/ocr-configuration.md +0 -0
  83. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/supported-formats.md +0 -0
  84. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/docs/user-guide/token-reduction.md +0 -0
  85. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/__init__.py +0 -0
  86. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/__main__.py +0 -0
  87. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_api/__init__.py +0 -0
  88. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_api/_config_cache.py +0 -0
  89. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_chunker.py +0 -0
  90. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_config.py +0 -0
  91. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_constants.py +0 -0
  92. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_document_classification.py +0 -0
  93. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_extractors/__init__.py +0 -0
  94. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_extractors/_base.py +0 -0
  95. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_extractors/_email.py +0 -0
  96. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_extractors/_html.py +0 -0
  97. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_extractors/_image.py +0 -0
  98. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_extractors/_pandoc.py +0 -0
  99. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_extractors/_pdf.py +0 -0
  100. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_extractors/_presentation.py +0 -0
  101. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  102. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_extractors/_structured.py +0 -0
  103. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_gmft.py +0 -0
  104. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_mcp/__init__.py +0 -0
  105. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_mcp/server.py +0 -0
  106. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_mime_types.py +0 -0
  107. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_ocr/__init__.py +0 -0
  108. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_ocr/_base.py +0 -0
  109. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_ocr/_easyocr.py +0 -0
  110. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
  111. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_ocr/_table_extractor.py +0 -0
  112. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_ocr/_tesseract.py +0 -0
  113. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_playa.py +0 -0
  114. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_registry.py +0 -0
  115. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/__init__.py +0 -0
  116. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/_reducer.py +0 -0
  117. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/_stopwords.py +0 -0
  118. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
  119. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
  120. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
  121. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
  122. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
  123. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
  124. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
  125. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
  126. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
  127. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
  128. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
  129. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
  130. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
  131. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
  132. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
  133. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
  134. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
  135. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
  136. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
  137. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
  138. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
  139. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
  140. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
  141. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
  142. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
  143. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
  144. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
  145. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
  146. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
  147. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
  148. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
  149. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
  150. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
  151. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
  152. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
  153. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
  154. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
  155. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
  156. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
  157. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
  158. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
  159. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
  160. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
  161. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
  162. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
  163. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
  164. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
  165. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
  166. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
  167. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
  168. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
  169. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
  170. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
  171. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
  172. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
  173. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
  174. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
  175. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
  176. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
  177. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
  178. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
  179. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
  180. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
  181. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
  182. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_types.py +0 -0
  183. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/__init__.py +0 -0
  184. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_cache.py +0 -0
  185. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_device.py +0 -0
  186. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_document_cache.py +0 -0
  187. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_errors.py +0 -0
  188. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_html_streaming.py +0 -0
  189. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_image_preprocessing.py +0 -0
  190. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_ocr_cache.py +0 -0
  191. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  192. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_process_pool.py +0 -0
  193. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_quality.py +0 -0
  194. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_ref.py +0 -0
  195. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_resource_managers.py +0 -0
  196. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_serialization.py +0 -0
  197. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_string.py +0 -0
  198. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_sync.py +0 -0
  199. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_table.py +0 -0
  200. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/_utils/_tmp.py +0 -0
  201. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/cli.py +0 -0
  202. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/exceptions.py +0 -0
  203. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/kreuzberg/py.typed +0 -0
  204. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/mkdocs.yaml +0 -0
  205. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/__init__.py +0 -0
  206. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/api/__init__.py +0 -0
  207. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/api/config_cache_test.py +0 -0
  208. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/api/conftest.py +0 -0
  209. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/api/header_config_hashing_test.py +0 -0
  210. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/api/image_extraction_test.py +0 -0
  211. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/api/main_test.py +0 -0
  212. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/api/runtime_config_test.py +0 -0
  213. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/conftest.py +0 -0
  214. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/__init__.py +0 -0
  215. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/comprehensive_config_test.py +0 -0
  216. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/config_test.py +0 -0
  217. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/constants_test.py +0 -0
  218. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/dpi_configuration_test.py +0 -0
  219. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/exceptions_test.py +0 -0
  220. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/extraction_batch_test.py +0 -0
  221. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/extraction_test.py +0 -0
  222. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/html_to_markdown_config_test.py +0 -0
  223. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/image_ocr_result_test.py +0 -0
  224. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/init_test.py +0 -0
  225. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/main_test.py +0 -0
  226. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/mime_types_test.py +0 -0
  227. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/registry_test.py +0 -0
  228. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/core/types_test.py +0 -0
  229. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/e2e/__init__.py +0 -0
  230. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/e2e/docker_e2e.py +0 -0
  231. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/README_image_tests.md +0 -0
  232. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/__init__.py +0 -0
  233. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/base_extractor_test.py +0 -0
  234. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/base_memory_limits_test.py +0 -0
  235. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/base_ocr_processing_test.py +0 -0
  236. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/base_ocr_simple_test.py +0 -0
  237. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/email_error_paths_test.py +0 -0
  238. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/email_test.py +0 -0
  239. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/html_invalid_base64_test.py +0 -0
  240. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/html_test.py +0 -0
  241. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/image_deduplication_test.py +0 -0
  242. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/image_error_handling_test.py +0 -0
  243. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/image_error_simple_test.py +0 -0
  244. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/image_test.py +0 -0
  245. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/json_test.py +0 -0
  246. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  247. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/pandoc_test.py +0 -0
  248. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/pdf_images_test.py +0 -0
  249. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/pdf_sync_images_test.py +0 -0
  250. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/presentation_test.py +0 -0
  251. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/spreadsheet_test.py +0 -0
  252. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/extractors/structured_test.py +0 -0
  253. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/features/__init__.py +0 -0
  254. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/features/chunker_test.py +0 -0
  255. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/features/document_classification_test.py +0 -0
  256. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/features/gmft_test.py +0 -0
  257. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/features/hooks_test.py +0 -0
  258. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/features/table_extraction_test.py +0 -0
  259. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/features/token_reduction_test.py +0 -0
  260. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/__init__.py +0 -0
  261. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/all_extractors_images_test.py +0 -0
  262. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/api/__init__.py +0 -0
  263. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/api/large_file_test.py +0 -0
  264. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/api/mounted_config_test.py +0 -0
  265. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/dpi_integration_test.py +0 -0
  266. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/multiprocessing/__init__.py +0 -0
  267. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
  268. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/ocr/__init__.py +0 -0
  269. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/ocr/device_integration_test.py +0 -0
  270. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
  271. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
  272. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/pandoc_images_test.py +0 -0
  273. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/pdf_images_test.py +0 -0
  274. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/pdf_real_images_test.py +0 -0
  275. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/pptx_complex_test.py +0 -0
  276. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/pptx_images_test.py +0 -0
  277. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/regression_test.py +0 -0
  278. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/integration/token_reduction_integration_test.py +0 -0
  279. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/interfaces/__init__.py +0 -0
  280. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/interfaces/cli_test.py +0 -0
  281. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/interfaces/mcp_server_test.py +0 -0
  282. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/mcp/__init__.py +0 -0
  283. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/mcp/mcp_server_test.py +0 -0
  284. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/multiprocessing/__init__.py +0 -0
  285. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  286. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/multiprocessing/process_manager_test.py +0 -0
  287. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  288. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/ocr/__init__.py +0 -0
  289. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/ocr/base_test.py +0 -0
  290. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/ocr/easyocr_test.py +0 -0
  291. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/ocr/init_test.py +0 -0
  292. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/ocr/paddleocr_test.py +0 -0
  293. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/ocr/tesseract_test.py +0 -0
  294. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/ocr/tesseract_tsv_test.py +0 -0
  295. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/performance/__init__.py +0 -0
  296. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/performance/large_pdf_perf_test.py +0 -0
  297. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
  298. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/contract.txt +0 -0
  299. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/contract_test.txt +0 -0
  300. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/document.docx +0 -0
  301. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/email/sample-email.eml +0 -0
  302. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  303. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/excel.xlsx +0 -0
  304. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/flower-no-text.jpg +0 -0
  305. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/form_test.txt +0 -0
  306. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/french-text.txt +0 -0
  307. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/german-text.txt +0 -0
  308. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/google-doc-document.pdf +0 -0
  309. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/html.html +0 -0
  310. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/images/test_hello_world.png +0 -0
  311. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/invoice_image.png +0 -0
  312. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/invoice_test.txt +0 -0
  313. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/json/complex_nested.json +0 -0
  314. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/aws_policy.json +0 -0
  315. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/earthquakes.geojson +0 -0
  316. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/github_emojis.json +0 -0
  317. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/iss_location.json +0 -0
  318. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/openapi_spec.json +0 -0
  319. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/package.json +0 -0
  320. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/rick_morty_character.json +0 -0
  321. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/json/sample-document.json +0 -0
  322. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/json/schema_test.json +0 -0
  323. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  324. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/markdown.md +0 -0
  325. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  326. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/non-searchable.pdf +0 -0
  327. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/ocr-image.jpg +0 -0
  328. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  329. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  330. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  331. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  332. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/receipt_test.txt +0 -0
  333. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/report_test.txt +0 -0
  334. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/sample-contract.pdf +0 -0
  335. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/scanned.pdf +0 -0
  336. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/searchable.pdf +0 -0
  337. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/sharable-web-guide.pdf +0 -0
  338. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/spanish-text.txt +0 -0
  339. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/tables/borderless_table.png +0 -0
  340. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/tables/complex_document.png +0 -0
  341. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/tables/simple_table.png +0 -0
  342. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/test-article.pdf +0 -0
  343. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/test-excel.xls +0 -0
  344. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  345. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/__init__.py +0 -0
  346. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/cache_test.py +0 -0
  347. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/device_test.py +0 -0
  348. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/errors_test.py +0 -0
  349. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/ocr_cache_test.py +0 -0
  350. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/pdf_lock_test.py +0 -0
  351. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/playa_helpers_test.py +0 -0
  352. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/playa_metadata_test.py +0 -0
  353. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/playa_test.py +0 -0
  354. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/process_pool_test.py +0 -0
  355. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/quality_test.py +0 -0
  356. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/ref_test.py +0 -0
  357. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/serialization_test.py +0 -0
  358. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/string_test.py +0 -0
  359. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/sync_test.py +0 -0
  360. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/table_test.py +0 -0
  361. {kreuzberg-3.17.2 → kreuzberg-3.18.0}/tests/utils/tmp_test.py +0 -0
@@ -44,15 +44,21 @@ jobs:
44
44
  uv sync --all-extras --dev
45
45
  shell: bash
46
46
 
47
- - name: Load Cached Pre-Commit Dependencies
48
- id: cached-pre-commit-dependencies
47
+ - name: Install prek
48
+ run: |
49
+ # Install prek using uv (recommended method)
50
+ uv tool install prek
51
+ echo "$HOME/.local/bin" >> $GITHUB_PATH
52
+
53
+ - name: Load Cached Prek Dependencies
54
+ id: cached-prek-dependencies
49
55
  uses: actions/cache@v4
50
56
  with:
51
- path: ~/.cache/pre-commit/
52
- key: pre-commit|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
57
+ path: ~/.cache/prek/
58
+ key: prek|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
53
59
 
54
- - name: Execute Pre-Commit
55
- run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
60
+ - name: Execute Prek
61
+ run: prek run --show-diff-on-failure --color=always --all-files
56
62
 
57
63
  coverage:
58
64
  needs: validate
@@ -119,7 +125,7 @@ jobs:
119
125
  shell: bash
120
126
 
121
127
  - name: Upload Coverage to DeepSource
122
- if: always()
128
+ if: always() && secrets.DEEPSOURCE_DSN != '' && needs.test-pr.result == 'success'
123
129
  env:
124
130
  DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
125
131
  run: |
@@ -214,7 +220,7 @@ jobs:
214
220
 
215
221
  coverage-pr:
216
222
  needs: test-pr
217
- if: github.event_name == 'pull_request' && always()
223
+ if: github.event_name == 'pull_request' && needs.test-pr.result == 'success'
218
224
  runs-on: ubuntu-latest
219
225
  timeout-minutes: 10
220
226
  steps:
@@ -227,7 +233,7 @@ jobs:
227
233
  name: coverage-pr-${{ github.sha }}
228
234
 
229
235
  - name: Upload Coverage to DeepSource
230
- if: always()
236
+ if: always() && secrets.DEEPSOURCE_DSN != '' && needs.test-pr.result == 'success'
231
237
  env:
232
238
  DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
233
239
  run: |
@@ -49,7 +49,7 @@ repos:
49
49
  hooks:
50
50
  - id: pyproject-fmt
51
51
  - repo: https://github.com/astral-sh/ruff-pre-commit
52
- rev: v0.13.1
52
+ rev: v0.13.2
53
53
  hooks:
54
54
  - id: ruff
55
55
  args: ["--fix", "--unsafe-fixes"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.17.2
3
+ Version: 3.18.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -28,12 +28,12 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
28
  Classifier: Topic :: Text Processing :: General
29
29
  Classifier: Typing :: Typed
30
30
  Requires-Python: >=3.10
31
- Requires-Dist: anyio>=4.10.0
31
+ Requires-Dist: anyio>=4.11.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.14.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.16.0
35
35
  Requires-Dist: langcodes>=3.5.0
36
- Requires-Dist: mcp>=1.14.1
36
+ Requires-Dist: mcp>=1.15.0
37
37
  Requires-Dist: msgspec>=0.18.0
38
38
  Requires-Dist: numpy>=2.0.0
39
39
  Requires-Dist: playa-pdf>=0.7.0
@@ -9,7 +9,7 @@ tasks:
9
9
  desc: "Install dependencies with uv"
10
10
  cmds:
11
11
  - uv sync --all-extras --all-packages
12
- - pre-commit install && pre-commit install -hook-type commit-msg
12
+ - prek install && prek install --hook-type commit-msg
13
13
 
14
14
  update:
15
15
  desc: "Update the dependencies"
@@ -17,7 +17,7 @@ tasks:
17
17
  - uv run uv-bump
18
18
  - cd benchmarks && uv run uv-bump && cd -
19
19
  - uv sync --all-extras --all-packages --upgrade
20
- - pre-commit autoupdate
20
+ - prek autoupdate
21
21
 
22
22
  test:
23
23
  desc: "Run tests with pytest"
@@ -32,7 +32,7 @@ tasks:
32
32
  lint:
33
33
  desc: "Lint code with ruff and docs with markdownlint"
34
34
  cmds:
35
- - pre-commit run --all-files
35
+ - prek run --all-files
36
36
 
37
37
  docs:build:
38
38
  desc: "Build documentation"
@@ -368,9 +368,10 @@ rules:
368
368
  - Fix linting issues: `ruff check --fix`
369
369
  - Type check: `mypy`
370
370
 
371
- ### Pre-commit
372
- - Install hooks: `pre-commit install && pre-commit install --hook-type commit-msg`
373
- - Run manually: `pre-commit run --all-files`
371
+ ### Prek
372
+ - Install prek: `uv tool install prek`
373
+ - Install hooks: `prek install && prek install --hook-type commit-msg`
374
+ - Run manually: `prek run --all-files`
374
375
 
375
376
  ### Documentation
376
377
  - Build docs: `uv run mkdocs build --clean --strict`
@@ -18,10 +18,14 @@ Thank you for contributing to Kreuzberg!
18
18
  uv sync --all-extras --dev
19
19
  ```
20
20
 
21
- 1. **Install pre-commit hooks**:
21
+ 1. **Install prek and hooks**:
22
22
 
23
23
  ```bash
24
- pre-commit install && pre-commit install --hook-type commit-msg
24
+ # Install prek using uv (recommended)
25
+ uv tool install prek
26
+
27
+ # Install git hooks
28
+ prek install && prek install --hook-type commit-msg
25
29
  ```
26
30
 
27
31
  ## Development
@@ -42,8 +46,8 @@ uv run ruff check # Lint
42
46
  uv run ruff check --fix # Auto-fix issues
43
47
  uv run mypy # Type check
44
48
 
45
- # Pre-commit
46
- uv run pre-commit run --all-files # Run all checks manually
49
+ # Prek
50
+ prek run --all-files # Run all checks manually
47
51
 
48
52
  # Documentation
49
53
  uv run mkdocs serve # Serve docs locally
@@ -70,7 +74,7 @@ Use [Conventional Commits](https://www.conventionalcommits.org/):
70
74
 
71
75
  - Python 3.10-3.13 supported
72
76
  - System dependencies (optional): Tesseract, Pandoc
73
- - Pre-commit runs automatically on commit
77
+ - Prek runs automatically on commit
74
78
  - Join our [Discord](https://discord.gg/pXxagNK2zN) for help
75
79
 
76
80
  ## License
@@ -62,7 +62,7 @@ Extract text from one or more files.
62
62
  - Method: `POST`
63
63
  - Content-Type: `multipart/form-data`
64
64
  - Body: One or more files with field name `data`
65
- - **Maximum file size: 1GB per file**
65
+ - **Maximum file size: Configurable via `KREUZBERG_MAX_UPLOAD_SIZE` environment variable (default: 1GB per file)**
66
66
 
67
67
  **Response:**
68
68
 
@@ -463,6 +463,37 @@ The API server uses the default Kreuzberg extraction configuration:
463
463
  - PDF, image, and document extraction is supported
464
464
  - Table extraction with GMFT (if installed)
465
465
 
466
+ ### Environment Variables
467
+
468
+ The API server can be configured using environment variables for production deployments:
469
+
470
+ #### Server Configuration
471
+
472
+ | Variable | Description | Default | Example |
473
+ | -------------------------------- | ---------------------------- | ------------------ | ------------------ |
474
+ | `KREUZBERG_MAX_UPLOAD_SIZE` | Maximum upload size in bytes | `1073741824` (1GB) | `2147483648` (2GB) |
475
+ | `KREUZBERG_ENABLE_OPENTELEMETRY` | Enable OpenTelemetry tracing | `true` | `false` |
476
+
477
+ #### Usage Examples
478
+
479
+ ```bash
480
+ # Set 2GB upload limit
481
+ export KREUZBERG_MAX_UPLOAD_SIZE=2147483648
482
+ litestar --app kreuzberg._api.main:app run
483
+
484
+ # Disable telemetry
485
+ export KREUZBERG_ENABLE_OPENTELEMETRY=false
486
+ uvicorn kreuzberg._api.main:app --host 0.0.0.0 --port 8000
487
+
488
+ # Production settings with Docker
489
+ docker run -p 8000:8000 \
490
+ -e KREUZBERG_MAX_UPLOAD_SIZE=5368709120 \
491
+ -e KREUZBERG_ENABLE_OPENTELEMETRY=true \
492
+ goldziher/kreuzberg:latest
493
+ ```
494
+
495
+ **Note**: Boolean environment variables accept `true`/`false`, `1`/`0`, `yes`/`no`, or `on`/`off` values.
496
+
466
497
  To use custom configuration, modify the extraction call in your own API wrapper:
467
498
 
468
499
  ```python
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import base64
4
4
  import io
5
+ import os
5
6
  import traceback
6
7
  from json import dumps
7
8
  from typing import TYPE_CHECKING, Annotated, Any, Literal
@@ -100,6 +101,36 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
100
101
  )
101
102
 
102
103
 
104
+ def _get_max_upload_size() -> int:
105
+ """Get the maximum upload size from environment variable.
106
+
107
+ Returns:
108
+ Maximum upload size in bytes. Defaults to 1GB if not set.
109
+
110
+ Environment Variables:
111
+ KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
112
+ """
113
+ default_size = 1024 * 1024 * 1024 # 1GB
114
+ try:
115
+ size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
116
+ # Return default if negative
117
+ return size if size >= 0 else default_size
118
+ except ValueError:
119
+ return default_size
120
+
121
+
122
+ def _is_opentelemetry_enabled() -> bool:
123
+ """Check if OpenTelemetry should be enabled.
124
+
125
+ Returns:
126
+ True if OpenTelemetry should be enabled, False otherwise.
127
+
128
+ Environment Variables:
129
+ KREUZBERG_ENABLE_OPENTELEMETRY: Enable OpenTelemetry tracing (true/false) (default: true)
130
+ """
131
+ return os.environ.get("KREUZBERG_ENABLE_OPENTELEMETRY", "true").lower() in ("true", "1", "yes", "on")
132
+
133
+
103
134
  def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
104
135
  error_type = type(exception).__name__
105
136
  error_message = str(exception)
@@ -242,7 +273,7 @@ async def handle_files_upload( # noqa: PLR0913
242
273
  - Language detection (if enabled)
243
274
 
244
275
  Supports various file formats including PDF, Office documents, images, and more.
245
- Maximum file size: 1GB per file.
276
+ Maximum file size: Configurable via KREUZBERG_MAX_UPLOAD_SIZE environment variable (default: 1GB per file).
246
277
 
247
278
  Args:
248
279
  request: The HTTP request object
@@ -379,9 +410,18 @@ type_encoders = {
379
410
  Image.Image: _pil_image_encoder,
380
411
  }
381
412
 
413
+
414
+ def _get_plugins() -> list[Any]:
415
+ """Get configured plugins based on environment variables."""
416
+ plugins = []
417
+ if _is_opentelemetry_enabled():
418
+ plugins.append(OpenTelemetryPlugin(OpenTelemetryConfig()))
419
+ return plugins
420
+
421
+
382
422
  app = Litestar(
383
423
  route_handlers=[handle_files_upload, health_check, get_configuration],
384
- plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
424
+ plugins=_get_plugins(),
385
425
  logging_config=StructLoggingConfig(),
386
426
  openapi_config=openapi_config,
387
427
  exception_handlers={
@@ -389,5 +429,5 @@ app = Litestar(
389
429
  Exception: general_exception_handler,
390
430
  },
391
431
  type_encoders=type_encoders,
392
- request_max_body_size=1024 * 1024 * 1024,
432
+ request_max_body_size=_get_max_upload_size(),
393
433
  )
@@ -0,0 +1,244 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+ import shutil
6
+ import subprocess
7
+ from functools import lru_cache
8
+ from itertools import chain
9
+ from typing import TYPE_CHECKING, Any
10
+
11
+ import anyio
12
+
13
+ from kreuzberg._types import Entity, SpacyEntityExtractionConfig
14
+ from kreuzberg._utils._sync import run_sync
15
+ from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
16
+
17
+ if TYPE_CHECKING:
18
+ from collections.abc import Sequence
19
+
20
+
21
+ def is_uv_available() -> bool:
22
+ """Check if uv is available in the environment."""
23
+ return shutil.which("uv") is not None
24
+
25
+
26
+ def get_spacy_model_url(model_name: str, version: str = "3.8.0") -> str:
27
+ """Get the direct download URL for a spaCy model.
28
+
29
+ Args:
30
+ model_name: Name of the spaCy model (e.g., 'en_core_web_sm')
31
+ version: Model version to download (default: 3.8.0)
32
+
33
+ Returns:
34
+ Direct download URL for the model
35
+ """
36
+ return f"https://github.com/explosion/spacy-models/releases/download/{model_name}-{version}/{model_name}-{version}-py3-none-any.whl"
37
+
38
+
39
+ async def install_spacy_model_with_uv(model_name: str) -> subprocess.CompletedProcess[str]:
40
+ """Install spaCy model using uv.
41
+
42
+ Args:
43
+ model_name: Name of the spaCy model to install
44
+
45
+ Returns:
46
+ Completed process result
47
+ """
48
+ model_url = get_spacy_model_url(model_name)
49
+ return await run_sync(
50
+ subprocess.run,
51
+ ["uv", "pip", "install", model_url],
52
+ capture_output=True,
53
+ text=True,
54
+ check=False,
55
+ )
56
+
57
+
58
+ async def install_spacy_model_with_spacy(model_name: str) -> bool:
59
+ """Install spaCy model using spacy download function.
60
+
61
+ Args:
62
+ model_name: Name of the spaCy model to install
63
+
64
+ Returns:
65
+ True if successful, False otherwise
66
+ """
67
+ try:
68
+ import spacy.cli.download # noqa: PLC0415
69
+
70
+ await run_sync(spacy.cli.download, model_name) # type: ignore[attr-defined]
71
+ return True
72
+ except (ImportError, OSError, RuntimeError):
73
+ return False
74
+
75
+
76
+ def extract_entities(
77
+ text: str,
78
+ entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
79
+ custom_patterns: frozenset[tuple[str, str]] | None = None,
80
+ languages: list[str] | None = None,
81
+ spacy_config: SpacyEntityExtractionConfig | None = None,
82
+ ) -> list[Entity]:
83
+ entities: list[Entity] = []
84
+ if custom_patterns:
85
+ entities.extend(
86
+ chain.from_iterable(
87
+ (
88
+ Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
89
+ for match in re.finditer(pattern, text)
90
+ )
91
+ for ent_type, pattern in custom_patterns
92
+ )
93
+ )
94
+
95
+ if spacy_config is None:
96
+ spacy_config = SpacyEntityExtractionConfig()
97
+
98
+ try:
99
+ import spacy # noqa: F401, PLC0415
100
+ except ImportError as e: # pragma: no cover
101
+ raise MissingDependencyError.create_for_package(
102
+ package_name="spacy",
103
+ dependency_group="entity-extraction",
104
+ functionality="Entity Extraction",
105
+ ) from e
106
+
107
+ model_name = select_spacy_model(languages, spacy_config)
108
+ if not model_name:
109
+ return entities
110
+
111
+ nlp = load_spacy_model(model_name, spacy_config)
112
+
113
+ if len(text) > spacy_config.max_doc_length:
114
+ text = text[: spacy_config.max_doc_length]
115
+
116
+ doc = nlp(text)
117
+
118
+ entity_type_mapping = {etype.upper() for etype in entity_types}
119
+
120
+ entities.extend(
121
+ Entity(
122
+ type=ent.label_,
123
+ text=ent.text,
124
+ start=ent.start_char,
125
+ end=ent.end_char,
126
+ )
127
+ for ent in doc.ents
128
+ if ent.label_ in entity_type_mapping or ent.label_.upper() in entity_type_mapping
129
+ )
130
+
131
+ return entities
132
+
133
+
134
+ @lru_cache(maxsize=32)
135
+ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
136
+ try:
137
+ import spacy # noqa: PLC0415
138
+ except ImportError:
139
+ return None
140
+
141
+ if spacy_config.model_cache_dir:
142
+ os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
143
+
144
+ try:
145
+ nlp = spacy.load(model_name)
146
+ except OSError:
147
+ # Try to download the model automatically
148
+ async def install_model() -> tuple[bool, str | None]:
149
+ """Install model and return success status and error message."""
150
+ # First try spaCy's built-in download
151
+ try:
152
+ success = await install_spacy_model_with_spacy(model_name)
153
+ if success:
154
+ return True, None
155
+ except (ImportError, OSError, RuntimeError) as e:
156
+ spacy_error = str(e)
157
+ else:
158
+ spacy_error = "spaCy download failed"
159
+
160
+ # If spaCy download failed and uv is available, try uv as fallback
161
+ if is_uv_available():
162
+ try:
163
+ result = await install_spacy_model_with_uv(model_name)
164
+ return result.returncode == 0, result.stderr
165
+ except (OSError, subprocess.SubprocessError) as e:
166
+ return False, f"spaCy: {spacy_error}, uv: {e!s}"
167
+
168
+ return False, spacy_error
169
+
170
+ # Run the async installation in a sync context
171
+ try:
172
+ success, error_details = anyio.run(install_model)
173
+ except (OSError, RuntimeError) as e:
174
+ success, error_details = False, str(e)
175
+
176
+ if not success:
177
+ # Generate appropriate error message based on available tools
178
+ if is_uv_available():
179
+ model_url = get_spacy_model_url(model_name)
180
+ manual_install_cmd = f"uv pip install {model_url}"
181
+ else:
182
+ manual_install_cmd = f"python -m spacy download {model_name}"
183
+
184
+ error_msg = (
185
+ f"Failed to download spaCy model '{model_name}'. Please install it manually with: {manual_install_cmd}"
186
+ )
187
+
188
+ if error_details:
189
+ error_msg += f"\nError details: {error_details}"
190
+
191
+ raise KreuzbergError(
192
+ error_msg,
193
+ context={
194
+ "model": model_name,
195
+ "manual_install_cmd": manual_install_cmd,
196
+ "error_details": error_details,
197
+ "uv_available": is_uv_available(),
198
+ },
199
+ ) from None
200
+
201
+ try:
202
+ nlp = spacy.load(model_name)
203
+ except OSError as e:
204
+ raise KreuzbergError(
205
+ f"Failed to load spaCy model '{model_name}' even after successful download. "
206
+ f"Please verify your spaCy installation and try reinstalling the model.",
207
+ context={"model": model_name, "error": str(e)},
208
+ ) from e
209
+
210
+ nlp.max_length = spacy_config.max_doc_length
211
+
212
+ return nlp
213
+
214
+
215
+ def select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
216
+ if not languages:
217
+ return spacy_config.get_model_for_language("en")
218
+
219
+ for lang in languages:
220
+ model_name = spacy_config.get_model_for_language(lang)
221
+ if model_name:
222
+ return model_name
223
+
224
+ return spacy_config.get_fallback_model()
225
+
226
+
227
+ def extract_keywords(
228
+ text: str,
229
+ keyword_count: int = 10,
230
+ ) -> list[tuple[str, float]]:
231
+ try:
232
+ from keybert import KeyBERT # noqa: PLC0415
233
+
234
+ kw_model = KeyBERT()
235
+ keywords = kw_model.extract_keywords(text, top_n=keyword_count)
236
+ return [(kw, float(score)) for kw, score in keywords]
237
+ except (RuntimeError, OSError, ValueError):
238
+ return []
239
+ except ImportError as e: # pragma: no cover
240
+ raise MissingDependencyError.create_for_package(
241
+ package_name="keybert",
242
+ dependency_group="entity-extraction",
243
+ functionality="Keyword Extraction",
244
+ ) from e
@@ -23,9 +23,7 @@ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -
23
23
  config = LanguageDetectionConfig()
24
24
 
25
25
  try:
26
- # detect always returns a list, use k parameter for multiple languages
27
26
  k = config.top_k if config.multilingual else 1
28
- # Use the model from config directly
29
27
  model = config.model
30
28
  results = detect(text, model=model, k=k)
31
29
 
@@ -76,7 +76,6 @@ def _validate_and_post_process_helper(
76
76
  result.keywords = None
77
77
 
78
78
  if config.auto_detect_language:
79
- # Use provided config or create one with the model from ExtractionConfig
80
79
  lang_config = config.language_detection_config
81
80
  if lang_config is None:
82
81
  from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
5
5
 
6
6
  [project]
7
7
  name = "kreuzberg"
8
- version = "3.17.2"
8
+ version = "3.18.0"
9
9
  description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
10
10
  readme = "README.md"
11
11
  keywords = [
@@ -57,12 +57,12 @@ classifiers = [
57
57
  ]
58
58
 
59
59
  dependencies = [
60
- "anyio>=4.10.0",
60
+ "anyio>=4.11.0",
61
61
  "chardetng-py>=0.3.5",
62
62
  "exceptiongroup>=1.2.2; python_version<'3.11'",
63
- "html-to-markdown[lxml]>=1.14.0",
63
+ "html-to-markdown[lxml]>=1.16.0",
64
64
  "langcodes>=3.5.0",
65
- "mcp>=1.14.1",
65
+ "mcp>=1.15.0",
66
66
  "msgspec>=0.18.0",
67
67
  "numpy>=2.0.0",
68
68
  "playa-pdf>=0.7.0",
@@ -111,14 +111,13 @@ scripts.kreuzberg-mcp = "kreuzberg._mcp.server:main"
111
111
  dev = [
112
112
  "covdefaults>=2.3.0",
113
113
  "mypy>=1.18.2",
114
- "pre-commit>=4.3.0",
115
114
  "pytest>=8.4.2",
116
115
  "pytest-cov>=7.0.0",
117
116
  "pytest-mock>=3.15.1",
118
117
  "pytest-rerunfailures>=16.0.1",
119
118
  "pytest-timeout>=2.4.0",
120
119
  "rich>=14.1.0",
121
- "ruff>=0.13.1",
120
+ "ruff>=0.13.2",
122
121
  "tabulate>=0.9.0",
123
122
  "trio>=0.31.0",
124
123
  "uv-bump",