kreuzberg 3.17.0__tar.gz → 3.17.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (360)
  1. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.pre-commit-config.yaml +1 -1
  2. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/PKG-INFO +4 -4
  3. kreuzberg-3.17.2/kreuzberg/_language_detection.py +37 -0
  4. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_types.py +15 -6
  5. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/extraction.py +8 -1
  6. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/pyproject.toml +6 -6
  7. kreuzberg-3.17.2/tests/features/language_detection_test.py +354 -0
  8. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/uv.lock +436 -371
  9. kreuzberg-3.17.0/kreuzberg/_language_detection.py +0 -60
  10. kreuzberg-3.17.0/tests/features/language_detection_test.py +0 -387
  11. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.commitlintrc +0 -0
  12. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.deepsource.toml +0 -0
  13. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.docker/Dockerfile +0 -0
  14. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.docker/README.md +0 -0
  15. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.dockerignore +0 -0
  16. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.github/dependabot.yaml +0 -0
  17. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.github/workflows/ci.yaml +0 -0
  18. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.github/workflows/docker-e2e-tests.yml +0 -0
  19. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.github/workflows/docs.yml +0 -0
  20. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.github/workflows/pr-title.yaml +0 -0
  21. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.github/workflows/publish-docker.yml +0 -0
  22. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.github/workflows/release.yaml +0 -0
  23. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.github/workflows/test-docker-builds.yml +0 -0
  24. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.gitignore +0 -0
  25. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.markdownlint.yaml +0 -0
  26. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/.prettierignore +0 -0
  27. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/ATTRIBUTIONS.md +0 -0
  28. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/LICENSE +0 -0
  29. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/README.md +0 -0
  30. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/Taskfile.yml +0 -0
  31. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/ai-rulez.yaml +0 -0
  32. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/README.md +0 -0
  33. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/__init__.py +0 -0
  34. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/batch_size_benchmark.py +0 -0
  35. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/batch_validation_benchmark.py +0 -0
  36. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/py.typed +0 -0
  37. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/pyproject.toml +0 -0
  38. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/src/__init__.py +0 -0
  39. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/src/__main__.py +0 -0
  40. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/src/benchmarks.py +0 -0
  41. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/src/cli.py +0 -0
  42. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/src/models.py +0 -0
  43. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/src/profiler.py +0 -0
  44. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/src/runner.py +0 -0
  45. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/benchmarks/token_reduction_compression_benchmark.py +0 -0
  46. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/advanced/custom-extractors.md +0 -0
  47. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/advanced/custom-hooks.md +0 -0
  48. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/advanced/error-handling.md +0 -0
  49. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/advanced/index.md +0 -0
  50. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/advanced/performance.md +0 -0
  51. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/api-reference/exceptions.md +0 -0
  52. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/api-reference/extraction-functions.md +0 -0
  53. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/api-reference/extractor-registry.md +0 -0
  54. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/api-reference/index.md +0 -0
  55. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/api-reference/ocr-configuration.md +0 -0
  56. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/api-reference/types.md +0 -0
  57. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/assets/favicon.png +0 -0
  58. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/assets/logo.png +0 -0
  59. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/cli.md +0 -0
  60. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/contributing.md +0 -0
  61. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/css/extra.css +0 -0
  62. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/examples/extraction-examples.md +0 -0
  63. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/examples/index.md +0 -0
  64. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/getting-started/index.md +0 -0
  65. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/getting-started/installation.md +0 -0
  66. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/getting-started/quick-start.md +0 -0
  67. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/index.md +0 -0
  68. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/api-server.md +0 -0
  69. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/basic-usage.md +0 -0
  70. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/chunking.md +0 -0
  71. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/docker.md +0 -0
  72. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/document-classification.md +0 -0
  73. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/extraction-configuration.md +0 -0
  74. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/index.md +0 -0
  75. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/mcp-server.md +0 -0
  76. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/metadata-extraction.md +0 -0
  77. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/ocr-backends.md +0 -0
  78. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/ocr-configuration.md +0 -0
  79. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/supported-formats.md +0 -0
  80. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/docs/user-guide/token-reduction.md +0 -0
  81. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/__init__.py +0 -0
  82. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/__main__.py +0 -0
  83. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_api/__init__.py +0 -0
  84. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_api/_config_cache.py +0 -0
  85. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_api/main.py +0 -0
  86. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_chunker.py +0 -0
  87. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_config.py +0 -0
  88. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_constants.py +0 -0
  89. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_document_classification.py +0 -0
  90. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_entity_extraction.py +0 -0
  91. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_extractors/__init__.py +0 -0
  92. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_extractors/_base.py +0 -0
  93. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_extractors/_email.py +0 -0
  94. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_extractors/_html.py +0 -0
  95. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_extractors/_image.py +0 -0
  96. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_extractors/_pandoc.py +0 -0
  97. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_extractors/_pdf.py +0 -0
  98. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_extractors/_presentation.py +0 -0
  99. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  100. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_extractors/_structured.py +0 -0
  101. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_gmft.py +0 -0
  102. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_mcp/__init__.py +0 -0
  103. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_mcp/server.py +0 -0
  104. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_mime_types.py +0 -0
  105. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_ocr/__init__.py +0 -0
  106. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_ocr/_base.py +0 -0
  107. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_ocr/_easyocr.py +0 -0
  108. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_ocr/_paddleocr.py +0 -0
  109. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_ocr/_table_extractor.py +0 -0
  110. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_ocr/_tesseract.py +0 -0
  111. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_playa.py +0 -0
  112. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_registry.py +0 -0
  113. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/__init__.py +0 -0
  114. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/_reducer.py +0 -0
  115. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/_stopwords.py +0 -0
  116. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
  117. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
  118. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
  119. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
  120. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
  121. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
  122. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
  123. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
  124. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
  125. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
  126. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
  127. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
  128. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
  129. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
  130. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
  131. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
  132. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
  133. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
  134. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
  135. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
  136. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
  137. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
  138. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
  139. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
  140. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
  141. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
  142. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
  143. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
  144. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
  145. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
  146. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
  147. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
  148. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
  149. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
  150. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
  151. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
  152. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
  153. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
  154. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
  155. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
  156. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
  157. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
  158. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
  159. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
  160. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
  161. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
  162. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
  163. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
  164. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
  165. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
  166. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
  167. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
  168. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
  169. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
  170. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
  171. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
  172. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
  173. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
  174. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
  175. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
  176. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
  177. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
  178. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
  179. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
  180. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/__init__.py +0 -0
  181. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_cache.py +0 -0
  182. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_device.py +0 -0
  183. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_document_cache.py +0 -0
  184. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_errors.py +0 -0
  185. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_html_streaming.py +0 -0
  186. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_image_preprocessing.py +0 -0
  187. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_ocr_cache.py +0 -0
  188. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_pdf_lock.py +0 -0
  189. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_process_pool.py +0 -0
  190. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_quality.py +0 -0
  191. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_ref.py +0 -0
  192. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_resource_managers.py +0 -0
  193. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_serialization.py +0 -0
  194. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_string.py +0 -0
  195. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_sync.py +0 -0
  196. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_table.py +0 -0
  197. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/_utils/_tmp.py +0 -0
  198. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/cli.py +0 -0
  199. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/exceptions.py +0 -0
  200. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/kreuzberg/py.typed +0 -0
  201. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/mkdocs.yaml +0 -0
  202. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/__init__.py +0 -0
  203. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/api/__init__.py +0 -0
  204. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/api/config_cache_test.py +0 -0
  205. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/api/conftest.py +0 -0
  206. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/api/header_config_hashing_test.py +0 -0
  207. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/api/image_extraction_test.py +0 -0
  208. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/api/main_test.py +0 -0
  209. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/api/runtime_config_test.py +0 -0
  210. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/conftest.py +0 -0
  211. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/__init__.py +0 -0
  212. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/comprehensive_config_test.py +0 -0
  213. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/config_test.py +0 -0
  214. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/constants_test.py +0 -0
  215. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/dpi_configuration_test.py +0 -0
  216. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/exceptions_test.py +0 -0
  217. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/extraction_batch_test.py +0 -0
  218. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/extraction_test.py +0 -0
  219. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/html_to_markdown_config_test.py +0 -0
  220. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/image_ocr_result_test.py +0 -0
  221. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/init_test.py +0 -0
  222. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/main_test.py +0 -0
  223. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/mime_types_test.py +0 -0
  224. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/registry_test.py +0 -0
  225. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/core/types_test.py +0 -0
  226. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/e2e/__init__.py +0 -0
  227. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/e2e/docker_e2e.py +0 -0
  228. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/README_image_tests.md +0 -0
  229. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/__init__.py +0 -0
  230. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/base_extractor_test.py +0 -0
  231. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/base_memory_limits_test.py +0 -0
  232. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/base_ocr_processing_test.py +0 -0
  233. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/base_ocr_simple_test.py +0 -0
  234. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/email_error_paths_test.py +0 -0
  235. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/email_test.py +0 -0
  236. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/html_invalid_base64_test.py +0 -0
  237. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/html_test.py +0 -0
  238. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/image_deduplication_test.py +0 -0
  239. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/image_error_handling_test.py +0 -0
  240. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/image_error_simple_test.py +0 -0
  241. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/image_test.py +0 -0
  242. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/json_test.py +0 -0
  243. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/pandoc_metadata_test.py +0 -0
  244. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/pandoc_test.py +0 -0
  245. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/pdf_images_test.py +0 -0
  246. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/pdf_sync_images_test.py +0 -0
  247. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/pdf_test.py +0 -0
  248. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/presentation_test.py +0 -0
  249. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/spreadsheet_test.py +0 -0
  250. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/extractors/structured_test.py +0 -0
  251. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/features/__init__.py +0 -0
  252. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/features/chunker_test.py +0 -0
  253. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/features/document_classification_test.py +0 -0
  254. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/features/entity_extraction_test.py +0 -0
  255. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/features/gmft_test.py +0 -0
  256. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/features/hooks_test.py +0 -0
  257. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/features/table_extraction_test.py +0 -0
  258. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/features/token_reduction_test.py +0 -0
  259. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/__init__.py +0 -0
  260. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/all_extractors_images_test.py +0 -0
  261. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/api/__init__.py +0 -0
  262. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/api/large_file_test.py +0 -0
  263. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/api/mounted_config_test.py +0 -0
  264. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/dpi_integration_test.py +0 -0
  265. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/multiprocessing/__init__.py +0 -0
  266. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
  267. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/ocr/__init__.py +0 -0
  268. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/ocr/device_integration_test.py +0 -0
  269. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
  270. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
  271. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/pandoc_images_test.py +0 -0
  272. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/pdf_images_test.py +0 -0
  273. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/pdf_real_images_test.py +0 -0
  274. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/pptx_complex_test.py +0 -0
  275. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/pptx_images_test.py +0 -0
  276. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/regression_test.py +0 -0
  277. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/integration/token_reduction_integration_test.py +0 -0
  278. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/interfaces/__init__.py +0 -0
  279. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/interfaces/cli_test.py +0 -0
  280. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/interfaces/mcp_server_test.py +0 -0
  281. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/mcp/__init__.py +0 -0
  282. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/mcp/mcp_server_test.py +0 -0
  283. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/multiprocessing/__init__.py +0 -0
  284. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  285. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/multiprocessing/process_manager_test.py +0 -0
  286. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  287. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/ocr/__init__.py +0 -0
  288. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/ocr/base_test.py +0 -0
  289. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/ocr/easyocr_test.py +0 -0
  290. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/ocr/init_test.py +0 -0
  291. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/ocr/paddleocr_test.py +0 -0
  292. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/ocr/tesseract_test.py +0 -0
  293. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/ocr/tesseract_tsv_test.py +0 -0
  294. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/performance/__init__.py +0 -0
  295. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/performance/large_pdf_perf_test.py +0 -0
  296. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
  297. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/contract.txt +0 -0
  298. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/contract_test.txt +0 -0
  299. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/document.docx +0 -0
  300. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/email/sample-email.eml +0 -0
  301. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  302. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/excel.xlsx +0 -0
  303. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/flower-no-text.jpg +0 -0
  304. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/form_test.txt +0 -0
  305. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/french-text.txt +0 -0
  306. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/german-text.txt +0 -0
  307. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/google-doc-document.pdf +0 -0
  308. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/html.html +0 -0
  309. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/images/test_hello_world.png +0 -0
  310. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/invoice_image.png +0 -0
  311. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/invoice_test.txt +0 -0
  312. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/json/complex_nested.json +0 -0
  313. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/json/real_world/aws_policy.json +0 -0
  314. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/json/real_world/earthquakes.geojson +0 -0
  315. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/json/real_world/github_emojis.json +0 -0
  316. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/json/real_world/iss_location.json +0 -0
  317. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/json/real_world/openapi_spec.json +0 -0
  318. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/json/real_world/package.json +0 -0
  319. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/json/real_world/rick_morty_character.json +0 -0
  320. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/json/sample-document.json +0 -0
  321. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/json/schema_test.json +0 -0
  322. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  323. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/markdown.md +0 -0
  324. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/non-ascii-text.pdf +0 -0
  325. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/non-searchable.pdf +0 -0
  326. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/ocr-image.jpg +0 -0
  327. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  328. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  329. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  330. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  331. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/receipt_test.txt +0 -0
  332. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/report_test.txt +0 -0
  333. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/sample-contract.pdf +0 -0
  334. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/scanned.pdf +0 -0
  335. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/searchable.pdf +0 -0
  336. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/sharable-web-guide.pdf +0 -0
  337. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/spanish-text.txt +0 -0
  338. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/tables/borderless_table.png +0 -0
  339. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/tables/complex_document.png +0 -0
  340. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/tables/simple_table.png +0 -0
  341. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/test-article.pdf +0 -0
  342. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/test-excel.xls +0 -0
  343. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  344. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/__init__.py +0 -0
  345. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/cache_test.py +0 -0
  346. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/device_test.py +0 -0
  347. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/errors_test.py +0 -0
  348. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/ocr_cache_test.py +0 -0
  349. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/pdf_lock_test.py +0 -0
  350. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/playa_helpers_test.py +0 -0
  351. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/playa_metadata_test.py +0 -0
  352. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/playa_test.py +0 -0
  353. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/process_pool_test.py +0 -0
  354. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/quality_test.py +0 -0
  355. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/ref_test.py +0 -0
  356. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/serialization_test.py +0 -0
  357. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/string_test.py +0 -0
  358. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/sync_test.py +0 -0
  359. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/table_test.py +0 -0
  360. {kreuzberg-3.17.0 → kreuzberg-3.17.2}/tests/utils/tmp_test.py +0 -0
@@ -49,7 +49,7 @@ repos:
49
49
  hooks:
50
50
  - id: pyproject-fmt
51
51
  - repo: https://github.com/astral-sh/ruff-pre-commit
52
- rev: v0.13.0
52
+ rev: v0.13.1
53
53
  hooks:
54
54
  - id: ruff
55
55
  args: ["--fix", "--unsafe-fixes"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.17.0
3
+ Version: 3.17.2
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,14 +31,14 @@ Requires-Python: >=3.10
31
31
  Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.13.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.14.0
35
35
  Requires-Dist: langcodes>=3.5.0
36
- Requires-Dist: mcp>=1.14.0
36
+ Requires-Dist: mcp>=1.14.1
37
37
  Requires-Dist: msgspec>=0.18.0
38
38
  Requires-Dist: numpy>=2.0.0
39
39
  Requires-Dist: playa-pdf>=0.7.0
40
40
  Requires-Dist: polars>=1.33.1
41
- Requires-Dist: psutil>=7.0.0
41
+ Requires-Dist: psutil>=7.1.0
42
42
  Requires-Dist: pypdfium2==4.30.0
43
43
  Requires-Dist: python-calamine>=0.5.3
44
44
  Requires-Dist: python-pptx>=1.0.2
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+
5
+ from kreuzberg._types import LanguageDetectionConfig
6
+ from kreuzberg.exceptions import MissingDependencyError
7
+
8
+ _CACHE_SIZE = 128
9
+
10
+
11
+ @lru_cache(maxsize=_CACHE_SIZE)
12
+ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
13
+ try:
14
+ from fast_langdetect import detect # noqa: PLC0415
15
+ except ImportError as e:
16
+ raise MissingDependencyError.create_for_package(
17
+ dependency_group="langdetect",
18
+ functionality="language detection",
19
+ package_name="fast-langdetect",
20
+ ) from e
21
+
22
+ if config is None:
23
+ config = LanguageDetectionConfig()
24
+
25
+ try:
26
+ # detect always returns a list, use k parameter for multiple languages
27
+ k = config.top_k if config.multilingual else 1
28
+ # Use the model from config directly
29
+ model = config.model
30
+ results = detect(text, model=model, k=k)
31
+
32
+ if results:
33
+ langs = [result["lang"].lower() for result in results if result.get("lang")]
34
+ return langs if langs else None
35
+ return None
36
+ except Exception: # noqa: BLE001
37
+ return None
@@ -402,9 +402,12 @@ class ImageOCRConfig(ConfigDict):
402
402
 
403
403
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
404
404
  class LanguageDetectionConfig(ConfigDict):
405
- low_memory: bool = True
406
- """If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
407
- Defaults to True for better memory efficiency."""
405
+ model: Literal["lite", "full", "auto"] = "auto"
406
+ """Language detection model to use:
407
+ - 'lite': Smaller, faster model with good accuracy
408
+ - 'full': Larger model with highest accuracy
409
+ - 'auto': Automatically choose based on memory availability (default)
410
+ """
408
411
  top_k: int = 3
409
412
  """Maximum number of languages to return for multilingual detection."""
410
413
  multilingual: bool = False
@@ -412,8 +415,8 @@ class LanguageDetectionConfig(ConfigDict):
412
415
  If False, uses single language detection."""
413
416
  cache_dir: str | None = None
414
417
  """Custom directory for model cache. If None, uses system default."""
415
- allow_fallback: bool = True
416
- """If True, falls back to small model if large model fails."""
418
+ low_memory: bool = True
419
+ """Deprecated. Use 'model' parameter instead. If True, uses 'lite' model."""
417
420
 
418
421
 
419
422
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
@@ -983,8 +986,14 @@ class ExtractionConfig(ConfigDict):
983
986
  """Custom entity patterns as a frozenset of (entity_type, regex_pattern) tuples."""
984
987
  auto_detect_language: bool = False
985
988
  """Whether to automatically detect language and configure OCR accordingly."""
989
+ language_detection_model: Literal["lite", "full", "auto"] = "auto"
990
+ """Language detection model to use when auto_detect_language is True.
991
+ - 'lite': Smaller, faster model with good accuracy
992
+ - 'full': Larger model with highest accuracy
993
+ - 'auto': Automatically choose based on memory availability (default)
994
+ """
986
995
  language_detection_config: LanguageDetectionConfig | None = None
987
- """Configuration for language detection. If None, uses default settings."""
996
+ """Configuration for language detection. If None, uses default settings with language_detection_model."""
988
997
  spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
989
998
  """Configuration for spaCy entity extraction. If None, uses default settings."""
990
999
  auto_detect_document_type: bool = False
@@ -76,9 +76,16 @@ def _validate_and_post_process_helper(
76
76
  result.keywords = None
77
77
 
78
78
  if config.auto_detect_language:
79
+ # Use provided config or create one with the model from ExtractionConfig
80
+ lang_config = config.language_detection_config
81
+ if lang_config is None:
82
+ from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
83
+
84
+ lang_config = LanguageDetectionConfig(model=config.language_detection_model)
85
+
79
86
  result.detected_languages = detect_languages(
80
87
  result.content,
81
- config=config.language_detection_config,
88
+ config=lang_config,
82
89
  )
83
90
 
84
91
  if config.auto_detect_document_type:
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
5
5
 
6
6
  [project]
7
7
  name = "kreuzberg"
8
- version = "3.17.0"
8
+ version = "3.17.2"
9
9
  description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
10
10
  readme = "README.md"
11
11
  keywords = [
@@ -60,14 +60,14 @@ dependencies = [
60
60
  "anyio>=4.10.0",
61
61
  "chardetng-py>=0.3.5",
62
62
  "exceptiongroup>=1.2.2; python_version<'3.11'",
63
- "html-to-markdown[lxml]>=1.13.0",
63
+ "html-to-markdown[lxml]>=1.14.0",
64
64
  "langcodes>=3.5.0",
65
- "mcp>=1.14.0",
65
+ "mcp>=1.14.1",
66
66
  "msgspec>=0.18.0",
67
67
  "numpy>=2.0.0",
68
68
  "playa-pdf>=0.7.0",
69
69
  "polars>=1.33.1",
70
- "psutil>=7.0.0",
70
+ "psutil>=7.1.0",
71
71
  "pypdfium2==4.30.0", # pinned due to bug in 4.30.1, until v5 is stable
72
72
  "python-calamine>=0.5.3",
73
73
  "python-pptx>=1.0.2",
@@ -110,7 +110,7 @@ scripts.kreuzberg-mcp = "kreuzberg._mcp.server:main"
110
110
  [dependency-groups]
111
111
  dev = [
112
112
  "covdefaults>=2.3.0",
113
- "mypy>=1.18.1",
113
+ "mypy>=1.18.2",
114
114
  "pre-commit>=4.3.0",
115
115
  "pytest>=8.4.2",
116
116
  "pytest-cov>=7.0.0",
@@ -118,7 +118,7 @@ dev = [
118
118
  "pytest-rerunfailures>=16.0.1",
119
119
  "pytest-timeout>=2.4.0",
120
120
  "rich>=14.1.0",
121
- "ruff>=0.13.0",
121
+ "ruff>=0.13.1",
122
122
  "tabulate>=0.9.0",
123
123
  "trio>=0.31.0",
124
124
  "uv-bump",
@@ -0,0 +1,354 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+ from unittest.mock import Mock, patch
5
+
6
+ import pytest
7
+
8
+ from kreuzberg._language_detection import detect_languages
9
+ from kreuzberg._types import LanguageDetectionConfig
10
+ from kreuzberg.exceptions import MissingDependencyError
11
+
12
+ if TYPE_CHECKING:
13
+ from collections.abc import Generator
14
+
15
+
16
+ @pytest.fixture(autouse=True)
17
+ def clear_language_detection_cache() -> Generator[None, None, None]:
18
+ detect_languages.cache_clear()
19
+ yield
20
+ detect_languages.cache_clear()
21
+
22
+
23
+ def test_detect_languages_when_library_missing() -> None:
24
+ text = "This is some English text."
25
+
26
+ # Mock the import statement inside the function
27
+ with patch.dict("sys.modules", {"fast_langdetect": None}):
28
+ with pytest.raises(MissingDependencyError) as exc_info:
29
+ detect_languages(text)
30
+
31
+ error = exc_info.value
32
+ assert "fast-langdetect" in str(error)
33
+ assert "language detection" in str(error)
34
+
35
+
36
+ def test_detect_languages_single_language_success() -> None:
37
+ text = "This is some English text."
38
+ mock_detect_result = [{"lang": "EN", "score": 0.99}]
39
+
40
+ mock_detect = Mock(return_value=mock_detect_result)
41
+
42
+ with patch("fast_langdetect.detect", mock_detect):
43
+ result = detect_languages(text)
44
+
45
+ assert result == ["en"]
46
+ mock_detect.assert_called_once_with(text, model="auto", k=1)
47
+
48
+
49
+ def test_detect_languages_single_language_no_lang_key() -> None:
50
+ text = "This is some text."
51
+ mock_detect_result = [{"score": 0.50}]
52
+
53
+ mock_detect = Mock(return_value=mock_detect_result)
54
+
55
+ with patch("fast_langdetect.detect", mock_detect):
56
+ result = detect_languages(text)
57
+
58
+ assert result is None
59
+ mock_detect.assert_called_once_with(text, model="auto", k=1)
60
+
61
+
62
+ def test_detect_languages_single_language_empty_lang() -> None:
63
+ text = "This is some text."
64
+ mock_detect_result = [{"lang": "", "score": 0.50}]
65
+
66
+ mock_detect = Mock(return_value=mock_detect_result)
67
+
68
+ with patch("fast_langdetect.detect", mock_detect):
69
+ result = detect_languages(text)
70
+
71
+ assert result is None
72
+ mock_detect.assert_called_once_with(text, model="auto", k=1)
73
+
74
+
75
+ def test_detect_languages_single_language_none_result() -> None:
76
+ text = "This is some text."
77
+ mock_detect = Mock(return_value=None)
78
+
79
+ with patch("fast_langdetect.detect", mock_detect):
80
+ result = detect_languages(text)
81
+
82
+ assert result is None
83
+ mock_detect.assert_called_once_with(text, model="auto", k=1)
84
+
85
+
86
+ def test_detect_languages_multilingual_success() -> None:
87
+ text = "Hello world. Bonjour le monde."
88
+ config = LanguageDetectionConfig(multilingual=True, top_k=3)
89
+
90
+ mock_multilingual_results = [
91
+ {"lang": "EN", "score": 0.8},
92
+ {"lang": "FR", "score": 0.7},
93
+ {"lang": "ES", "score": 0.1},
94
+ ]
95
+
96
+ mock_detect = Mock(return_value=mock_multilingual_results)
97
+
98
+ with patch("fast_langdetect.detect", mock_detect):
99
+ result = detect_languages(text, config)
100
+
101
+ assert result == ["en", "fr", "es"]
102
+ mock_detect.assert_called_once_with(text, model="auto", k=3)
103
+
104
+
105
+ def test_detect_languages_multilingual_with_top_k() -> None:
106
+ text = "Hello world. Bonjour le monde."
107
+ config = LanguageDetectionConfig(multilingual=True, top_k=2)
108
+
109
+ mock_multilingual_results = [{"lang": "EN", "score": 0.8}, {"lang": "FR", "score": 0.7}]
110
+
111
+ mock_detect = Mock(return_value=mock_multilingual_results)
112
+
113
+ with patch("fast_langdetect.detect", mock_detect):
114
+ result = detect_languages(text, config)
115
+
116
+ assert result == ["en", "fr"]
117
+ mock_detect.assert_called_once_with(text, model="auto", k=2)
118
+
119
+
120
+ def test_detect_languages_multilingual_results_missing_lang() -> None:
121
+ text = "Mixed language text."
122
+ config = LanguageDetectionConfig(multilingual=True)
123
+
124
+ mock_multilingual_results = [
125
+ {"lang": "EN", "score": 0.8},
126
+ {"score": 0.6},
127
+ {"lang": "", "score": 0.4},
128
+ {"lang": "FR", "score": 0.3},
129
+ ]
130
+
131
+ mock_detect = Mock(return_value=mock_multilingual_results)
132
+
133
+ with patch("fast_langdetect.detect", mock_detect):
134
+ result = detect_languages(text, config)
135
+
136
+ assert result == ["en", "fr"]
137
+
138
+
139
+ def test_detect_languages_with_default_config() -> None:
140
+ text = "This is English text."
141
+ mock_detect_result = [{"lang": "EN", "score": 0.95}]
142
+
143
+ mock_detect = Mock(return_value=mock_detect_result)
144
+
145
+ with patch("fast_langdetect.detect", mock_detect):
146
+ result = detect_languages(text, config=None)
147
+
148
+ assert result == ["en"]
149
+ mock_detect.assert_called_once_with(text, model="auto", k=1)
150
+
151
+
152
+ def test_detect_languages_single_language_with_config() -> None:
153
+ text = "This is English text."
154
+ config = LanguageDetectionConfig(multilingual=False)
155
+ mock_detect_result = [{"lang": "EN", "score": 0.95}]
156
+
157
+ mock_detect = Mock(return_value=mock_detect_result)
158
+
159
+ with patch("fast_langdetect.detect", mock_detect):
160
+ result = detect_languages(text, config)
161
+
162
+ assert result == ["en"]
163
+ mock_detect.assert_called_once_with(text, model="auto", k=1)
164
+
165
+
166
+ def test_detect_languages_exception_handling() -> None:
167
+ text = "This is some text."
168
+
169
+ mock_detect = Mock(side_effect=RuntimeError("Detection failed"))
170
+
171
+ with patch("fast_langdetect.detect", mock_detect):
172
+ result = detect_languages(text)
173
+
174
+ assert result is None
175
+ mock_detect.assert_called_once_with(text, model="auto", k=1)
176
+
177
+
178
+ def test_detect_languages_multilingual_exception_handling() -> None:
179
+ text = "Mixed language text."
180
+ config = LanguageDetectionConfig(multilingual=True)
181
+
182
+ mock_detect = Mock(side_effect=ValueError("Multilingual detection failed"))
183
+
184
+ with patch("fast_langdetect.detect", mock_detect):
185
+ result = detect_languages(text, config)
186
+
187
+ assert result is None
188
+ mock_detect.assert_called_once_with(text, model="auto", k=3)
189
+
190
+
191
+ def test_detect_languages_caching_behavior() -> None:
192
+ text = "This is English text."
193
+ mock_detect_result = [{"lang": "EN", "score": 0.95}]
194
+
195
+ mock_detect = Mock(return_value=mock_detect_result)
196
+
197
+ with patch("fast_langdetect.detect", mock_detect):
198
+ config = LanguageDetectionConfig()
199
+ result1 = detect_languages(text, config)
200
+ result2 = detect_languages(text, config)
201
+
202
+ assert result1 == ["en"]
203
+ assert result2 == ["en"]
204
+ mock_detect.assert_called_once_with(text, model="auto", k=1)
205
+
206
+
207
+ def test_detect_languages_cache_different_configs() -> None:
208
+ text = "This is English text."
209
+ mock_detect_result = [{"lang": "EN", "score": 0.95}]
210
+
211
+ mock_detect = Mock(return_value=mock_detect_result)
212
+
213
+ with patch("fast_langdetect.detect", mock_detect):
214
+ config1 = LanguageDetectionConfig(multilingual=False)
215
+ config2 = LanguageDetectionConfig(multilingual=True, top_k=2)
216
+
217
+ result1 = detect_languages(text, config1)
218
+ result2 = detect_languages(text, config2)
219
+
220
+ assert result1 == ["en"]
221
+ assert result2 == ["en"]
222
+ assert mock_detect.call_count == 2
223
+ mock_detect.assert_any_call(text, model="auto", k=1)
224
+ mock_detect.assert_any_call(text, model="auto", k=2)
225
+
226
+
227
+ # Real integration tests without mocks
228
+ def test_detect_languages_real_single_language() -> None:
229
+ text = "This is definitely an English text with multiple sentences. It should be detected as English."
230
+ result = detect_languages(text)
231
+
232
+ assert result is not None
233
+ assert len(result) == 1
234
+ assert result[0] == "en"
235
+
236
+
237
+ def test_detect_languages_real_multilingual() -> None:
238
+ # Text with mixed languages
239
+ text = "Hello world. Bonjour le monde. Hola mundo. Ciao mondo."
240
+ config = LanguageDetectionConfig(multilingual=True, top_k=4)
241
+ result = detect_languages(text, config)
242
+
243
+ assert result is not None
244
+ assert len(result) >= 1
245
+ # The exact languages detected may vary, but we should get at least one
246
+ assert all(isinstance(lang, str) for lang in result)
247
+ assert all(len(lang) == 2 for lang in result) # Language codes should be 2 chars
248
+
249
+
250
+ def test_detect_languages_real_empty_text() -> None:
251
+ text = ""
252
+ result = detect_languages(text)
253
+
254
+ # Empty text should return None or raise an exception (caught and returns None)
255
+ # Note: fast_langdetect may return a default language for empty text
256
+ assert result is None or (isinstance(result, list) and len(result) <= 1)
257
+
258
+
259
+ def test_detect_languages_real_with_config() -> None:
260
+ text = "This is an English sentence."
261
+ config = LanguageDetectionConfig(multilingual=False)
262
+ result = detect_languages(text, config)
263
+
264
+ assert result is not None
265
+ assert len(result) == 1
266
+ assert result[0] == "en"
267
+
268
+
269
+ def test_detect_languages_real_french_text() -> None:
270
+ text = "Ceci est un texte en français. Il devrait être détecté comme français."
271
+ result = detect_languages(text)
272
+
273
+ assert result is not None
274
+ assert len(result) == 1
275
+ # Note: Model accuracy may vary, checking that we get a valid language code
276
+ assert isinstance(result[0], str)
277
+ assert len(result[0]) == 2
278
+
279
+
280
+ def test_detect_languages_real_german_text() -> None:
281
+ text = "Dies ist ein deutscher Text. Es sollte als Deutsch erkannt werden."
282
+ result = detect_languages(text)
283
+
284
+ assert result is not None
285
+ assert len(result) == 1
286
+ # Note: Model accuracy may vary, checking that we get a valid language code
287
+ assert isinstance(result[0], str)
288
+ assert len(result[0]) == 2
289
+
290
+
291
+ def test_detect_languages_real_spanish_text() -> None:
292
+ text = "Este es un texto en español. Debería ser detectado como español."
293
+ result = detect_languages(text)
294
+
295
+ assert result is not None
296
+ assert len(result) == 1
297
+ # Note: Model accuracy may vary, checking that we get a valid language code
298
+ assert isinstance(result[0], str)
299
+ assert len(result[0]) == 2
300
+
301
+
302
+ def test_detect_languages_real_mixed_languages_with_top_k() -> None:
303
+ # Text with multiple languages - should detect top languages
304
+ text = "English text. Texte français. Deutscher Text. Texto español."
305
+ config = LanguageDetectionConfig(multilingual=True, top_k=2)
306
+ result = detect_languages(text, config)
307
+
308
+ assert result is not None
309
+ # Should detect at least 1, up to 2 languages
310
+ assert 1 <= len(result) <= 2
311
+ assert all(isinstance(lang, str) for lang in result)
312
+ assert all(len(lang) == 2 for lang in result)
313
+
314
+
315
+ def test_detect_languages_with_lite_model() -> None:
316
+ text = "This is English text."
317
+ config = LanguageDetectionConfig(model="lite")
318
+ mock_detect_result = [{"lang": "EN", "score": 0.95}]
319
+
320
+ mock_detect = Mock(return_value=mock_detect_result)
321
+
322
+ with patch("fast_langdetect.detect", mock_detect):
323
+ result = detect_languages(text, config)
324
+
325
+ assert result == ["en"]
326
+ mock_detect.assert_called_once_with(text, model="lite", k=1)
327
+
328
+
329
+ def test_detect_languages_with_full_model() -> None:
330
+ text = "This is English text."
331
+ config = LanguageDetectionConfig(model="full")
332
+ mock_detect_result = [{"lang": "EN", "score": 0.95}]
333
+
334
+ mock_detect = Mock(return_value=mock_detect_result)
335
+
336
+ with patch("fast_langdetect.detect", mock_detect):
337
+ result = detect_languages(text, config)
338
+
339
+ assert result == ["en"]
340
+ mock_detect.assert_called_once_with(text, model="full", k=1)
341
+
342
+
343
+ def test_detect_languages_with_auto_model() -> None:
344
+ text = "This is English text."
345
+ config = LanguageDetectionConfig(model="auto")
346
+ mock_detect_result = [{"lang": "EN", "score": 0.95}]
347
+
348
+ mock_detect = Mock(return_value=mock_detect_result)
349
+
350
+ with patch("fast_langdetect.detect", mock_detect):
351
+ result = detect_languages(text, config)
352
+
353
+ assert result == ["en"]
354
+ mock_detect.assert_called_once_with(text, model="auto", k=1)