kreuzberg 3.16.0__tar.gz → 3.17.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (362) hide show
  1. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/ci.yaml +42 -72
  2. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/docker-e2e-tests.yml +4 -0
  3. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/docs.yml +1 -1
  4. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/test-docker-builds.yml +4 -0
  5. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.pre-commit-config.yaml +4 -3
  6. kreuzberg-3.17.1/.prettierignore +1 -0
  7. kreuzberg-3.17.1/ATTRIBUTIONS.md +47 -0
  8. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/PKG-INFO +6 -5
  9. kreuzberg-3.17.1/benchmarks/token_reduction_compression_benchmark.py +268 -0
  10. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/types.md +6 -0
  11. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/index.md +1 -0
  12. kreuzberg-3.17.1/docs/user-guide/token-reduction.md +251 -0
  13. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/__init__.py +2 -0
  14. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_config.py +8 -9
  15. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_base.py +0 -46
  16. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_html.py +1 -1
  17. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_pandoc.py +2 -2
  18. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_pdf.py +4 -4
  19. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_gmft.py +2 -2
  20. kreuzberg-3.17.1/kreuzberg/_language_detection.py +37 -0
  21. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_mcp/server.py +1 -1
  22. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_mime_types.py +1 -1
  23. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_easyocr.py +4 -9
  24. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_paddleocr.py +1 -1
  25. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_tesseract.py +15 -25
  26. kreuzberg-3.17.1/kreuzberg/_token_reduction/__init__.py +11 -0
  27. kreuzberg-3.17.1/kreuzberg/_token_reduction/_reducer.py +439 -0
  28. kreuzberg-3.17.1/kreuzberg/_token_reduction/_stopwords.py +116 -0
  29. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
  30. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
  31. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
  32. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
  33. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
  34. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
  35. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
  36. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
  37. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
  38. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
  39. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
  40. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
  41. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
  42. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
  43. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
  44. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
  45. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
  46. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
  47. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
  48. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
  49. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
  50. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
  51. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
  52. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
  53. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
  54. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
  55. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
  56. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
  57. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
  58. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
  59. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
  60. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
  61. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
  62. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
  63. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
  64. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
  65. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
  66. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
  67. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
  68. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
  69. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
  70. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
  71. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
  72. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
  73. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
  74. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
  75. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
  76. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
  77. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
  78. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
  79. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
  80. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
  81. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
  82. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
  83. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
  84. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
  85. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
  86. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
  87. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
  88. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
  89. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
  90. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
  91. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
  92. kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
  93. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_types.py +50 -9
  94. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_image_preprocessing.py +1 -1
  95. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_ref.py +14 -6
  96. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/exceptions.py +0 -1
  97. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/extraction.py +33 -10
  98. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/pyproject.toml +10 -8
  99. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/config_cache_test.py +3 -27
  100. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/comprehensive_config_test.py +61 -0
  101. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/types_test.py +62 -0
  102. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/base_extractor_test.py +1 -1
  103. kreuzberg-3.17.1/tests/features/language_detection_test.py +354 -0
  104. kreuzberg-3.17.1/tests/features/token_reduction_test.py +813 -0
  105. kreuzberg-3.17.1/tests/integration/token_reduction_integration_test.py +173 -0
  106. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/tesseract_test.py +64 -0
  107. kreuzberg-3.17.1/tests/utils/playa_helpers_test.py +0 -0
  108. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/uv.lock +304 -255
  109. kreuzberg-3.16.0/docker-logs/docker-info.txt +0 -60
  110. kreuzberg-3.16.0/docker-logs/docker-version.txt +0 -27
  111. kreuzberg-3.16.0/kreuzberg/_language_detection.py +0 -60
  112. kreuzberg-3.16.0/tests/features/language_detection_test.py +0 -387
  113. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.commitlintrc +0 -0
  114. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.deepsource.toml +0 -0
  115. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.docker/Dockerfile +0 -0
  116. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.docker/README.md +0 -0
  117. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.dockerignore +0 -0
  118. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/dependabot.yaml +0 -0
  119. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/pr-title.yaml +0 -0
  120. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/publish-docker.yml +0 -0
  121. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/release.yaml +0 -0
  122. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.gitignore +0 -0
  123. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.markdownlint.yaml +0 -0
  124. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/LICENSE +0 -0
  125. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/README.md +0 -0
  126. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/Taskfile.yml +0 -0
  127. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/ai-rulez.yaml +0 -0
  128. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/README.md +0 -0
  129. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/__init__.py +0 -0
  130. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/batch_size_benchmark.py +0 -0
  131. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/batch_validation_benchmark.py +0 -0
  132. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/py.typed +0 -0
  133. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/pyproject.toml +0 -0
  134. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/__init__.py +0 -0
  135. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/__main__.py +0 -0
  136. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/benchmarks.py +0 -0
  137. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/cli.py +0 -0
  138. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/models.py +0 -0
  139. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/profiler.py +0 -0
  140. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/runner.py +0 -0
  141. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/advanced/custom-extractors.md +0 -0
  142. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/advanced/custom-hooks.md +0 -0
  143. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/advanced/error-handling.md +0 -0
  144. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/advanced/index.md +0 -0
  145. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/advanced/performance.md +0 -0
  146. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/exceptions.md +0 -0
  147. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/extraction-functions.md +0 -0
  148. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/extractor-registry.md +0 -0
  149. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/index.md +0 -0
  150. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/ocr-configuration.md +0 -0
  151. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/assets/favicon.png +0 -0
  152. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/assets/logo.png +0 -0
  153. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/cli.md +0 -0
  154. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/contributing.md +0 -0
  155. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/css/extra.css +0 -0
  156. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/examples/extraction-examples.md +0 -0
  157. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/examples/index.md +0 -0
  158. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/getting-started/index.md +0 -0
  159. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/getting-started/installation.md +0 -0
  160. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/getting-started/quick-start.md +0 -0
  161. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/index.md +0 -0
  162. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/api-server.md +0 -0
  163. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/basic-usage.md +0 -0
  164. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/chunking.md +0 -0
  165. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/docker.md +0 -0
  166. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/document-classification.md +0 -0
  167. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/extraction-configuration.md +0 -0
  168. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/mcp-server.md +0 -0
  169. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/metadata-extraction.md +0 -0
  170. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/ocr-backends.md +0 -0
  171. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/ocr-configuration.md +0 -0
  172. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/supported-formats.md +0 -0
  173. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/__main__.py +0 -0
  174. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_api/__init__.py +0 -0
  175. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_api/_config_cache.py +0 -0
  176. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_api/main.py +0 -0
  177. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_chunker.py +0 -0
  178. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_constants.py +0 -0
  179. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_document_classification.py +0 -0
  180. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_entity_extraction.py +0 -0
  181. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/__init__.py +0 -0
  182. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_email.py +0 -0
  183. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_image.py +0 -0
  184. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_presentation.py +0 -0
  185. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  186. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_structured.py +0 -0
  187. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_mcp/__init__.py +0 -0
  188. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/__init__.py +0 -0
  189. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_base.py +0 -0
  190. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_table_extractor.py +0 -0
  191. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_playa.py +0 -0
  192. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_registry.py +0 -0
  193. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/__init__.py +0 -0
  194. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_cache.py +0 -0
  195. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_device.py +0 -0
  196. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_document_cache.py +0 -0
  197. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_errors.py +0 -0
  198. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_html_streaming.py +0 -0
  199. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_ocr_cache.py +0 -0
  200. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
  201. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_process_pool.py +0 -0
  202. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_quality.py +0 -0
  203. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_resource_managers.py +0 -0
  204. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_serialization.py +0 -0
  205. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_string.py +0 -0
  206. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_sync.py +0 -0
  207. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_table.py +0 -0
  208. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_tmp.py +0 -0
  209. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/cli.py +0 -0
  210. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/py.typed +0 -0
  211. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/mkdocs.yaml +0 -0
  212. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/__init__.py +0 -0
  213. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/__init__.py +0 -0
  214. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/conftest.py +0 -0
  215. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/header_config_hashing_test.py +0 -0
  216. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/image_extraction_test.py +0 -0
  217. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/main_test.py +0 -0
  218. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/runtime_config_test.py +0 -0
  219. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/conftest.py +0 -0
  220. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/__init__.py +0 -0
  221. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/config_test.py +0 -0
  222. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/constants_test.py +0 -0
  223. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/dpi_configuration_test.py +0 -0
  224. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/exceptions_test.py +0 -0
  225. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/extraction_batch_test.py +0 -0
  226. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/extraction_test.py +0 -0
  227. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/html_to_markdown_config_test.py +0 -0
  228. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/image_ocr_result_test.py +0 -0
  229. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/init_test.py +0 -0
  230. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/main_test.py +0 -0
  231. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/mime_types_test.py +0 -0
  232. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/registry_test.py +0 -0
  233. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/e2e/__init__.py +0 -0
  234. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/e2e/docker_e2e.py +0 -0
  235. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/README_image_tests.md +0 -0
  236. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/__init__.py +0 -0
  237. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/base_memory_limits_test.py +0 -0
  238. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/base_ocr_processing_test.py +0 -0
  239. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/base_ocr_simple_test.py +0 -0
  240. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/email_error_paths_test.py +0 -0
  241. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/email_test.py +0 -0
  242. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/html_invalid_base64_test.py +0 -0
  243. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/html_test.py +0 -0
  244. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/image_deduplication_test.py +0 -0
  245. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/image_error_handling_test.py +0 -0
  246. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/image_error_simple_test.py +0 -0
  247. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/image_test.py +0 -0
  248. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/json_test.py +0 -0
  249. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/pandoc_metadata_test.py +0 -0
  250. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/pandoc_test.py +0 -0
  251. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/pdf_images_test.py +0 -0
  252. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/pdf_sync_images_test.py +0 -0
  253. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/pdf_test.py +0 -0
  254. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/presentation_test.py +0 -0
  255. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/spreadsheet_test.py +0 -0
  256. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/structured_test.py +0 -0
  257. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/__init__.py +0 -0
  258. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/chunker_test.py +0 -0
  259. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/document_classification_test.py +0 -0
  260. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/entity_extraction_test.py +0 -0
  261. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/gmft_test.py +0 -0
  262. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/hooks_test.py +0 -0
  263. /kreuzberg-3.16.0/tests/integration/__init__.py → /kreuzberg-3.17.1/tests/features/table_extraction_test.py +0 -0
  264. {kreuzberg-3.16.0/tests/integration/api → kreuzberg-3.17.1/tests/integration}/__init__.py +0 -0
  265. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/all_extractors_images_test.py +0 -0
  266. {kreuzberg-3.16.0/tests/integration/multiprocessing → kreuzberg-3.17.1/tests/integration/api}/__init__.py +0 -0
  267. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/api/large_file_test.py +0 -0
  268. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/api/mounted_config_test.py +0 -0
  269. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/dpi_integration_test.py +0 -0
  270. {kreuzberg-3.16.0/tests/integration/ocr → kreuzberg-3.17.1/tests/integration/multiprocessing}/__init__.py +0 -0
  271. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
  272. {kreuzberg-3.16.0/tests/interfaces → kreuzberg-3.17.1/tests/integration/ocr}/__init__.py +0 -0
  273. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/ocr/device_integration_test.py +0 -0
  274. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
  275. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
  276. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/pandoc_images_test.py +0 -0
  277. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/pdf_images_test.py +0 -0
  278. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/pdf_real_images_test.py +0 -0
  279. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/pptx_complex_test.py +0 -0
  280. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/pptx_images_test.py +0 -0
  281. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/regression_test.py +0 -0
  282. {kreuzberg-3.16.0/tests/mcp → kreuzberg-3.17.1/tests/interfaces}/__init__.py +0 -0
  283. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/interfaces/cli_test.py +0 -0
  284. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/interfaces/mcp_server_test.py +0 -0
  285. {kreuzberg-3.16.0/tests/multiprocessing → kreuzberg-3.17.1/tests/mcp}/__init__.py +0 -0
  286. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/mcp/mcp_server_test.py +0 -0
  287. {kreuzberg-3.16.0/tests/ocr → kreuzberg-3.17.1/tests/multiprocessing}/__init__.py +0 -0
  288. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  289. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/multiprocessing/process_manager_test.py +0 -0
  290. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  291. {kreuzberg-3.16.0/tests/performance → kreuzberg-3.17.1/tests/ocr}/__init__.py +0 -0
  292. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/base_test.py +0 -0
  293. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/easyocr_test.py +0 -0
  294. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/init_test.py +0 -0
  295. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/paddleocr_test.py +0 -0
  296. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/tesseract_tsv_test.py +0 -0
  297. {kreuzberg-3.16.0/tests/utils → kreuzberg-3.17.1/tests/performance}/__init__.py +0 -0
  298. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/performance/large_pdf_perf_test.py +0 -0
  299. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
  300. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/contract.txt +0 -0
  301. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/contract_test.txt +0 -0
  302. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/document.docx +0 -0
  303. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/email/sample-email.eml +0 -0
  304. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  305. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/excel.xlsx +0 -0
  306. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/flower-no-text.jpg +0 -0
  307. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/form_test.txt +0 -0
  308. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/french-text.txt +0 -0
  309. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/german-text.txt +0 -0
  310. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/google-doc-document.pdf +0 -0
  311. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/html.html +0 -0
  312. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/images/test_hello_world.png +0 -0
  313. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/invoice_image.png +0 -0
  314. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/invoice_test.txt +0 -0
  315. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/complex_nested.json +0 -0
  316. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/aws_policy.json +0 -0
  317. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/earthquakes.geojson +0 -0
  318. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/github_emojis.json +0 -0
  319. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/iss_location.json +0 -0
  320. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/openapi_spec.json +0 -0
  321. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/package.json +0 -0
  322. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/rick_morty_character.json +0 -0
  323. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/sample-document.json +0 -0
  324. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/schema_test.json +0 -0
  325. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  326. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/markdown.md +0 -0
  327. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
  328. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/non-searchable.pdf +0 -0
  329. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/ocr-image.jpg +0 -0
  330. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  331. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  332. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  333. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  334. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/receipt_test.txt +0 -0
  335. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/report_test.txt +0 -0
  336. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/sample-contract.pdf +0 -0
  337. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/scanned.pdf +0 -0
  338. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/searchable.pdf +0 -0
  339. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/sharable-web-guide.pdf +0 -0
  340. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/spanish-text.txt +0 -0
  341. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/tables/borderless_table.png +0 -0
  342. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/tables/complex_document.png +0 -0
  343. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/tables/simple_table.png +0 -0
  344. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/test-article.pdf +0 -0
  345. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/test-excel.xls +0 -0
  346. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  347. /kreuzberg-3.16.0/tests/utils/playa_helpers_test.py → /kreuzberg-3.17.1/tests/utils/__init__.py +0 -0
  348. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/cache_test.py +0 -0
  349. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/device_test.py +0 -0
  350. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/errors_test.py +0 -0
  351. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/ocr_cache_test.py +0 -0
  352. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/pdf_lock_test.py +0 -0
  353. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/playa_metadata_test.py +0 -0
  354. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/playa_test.py +0 -0
  355. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/process_pool_test.py +0 -0
  356. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/quality_test.py +0 -0
  357. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/ref_test.py +0 -0
  358. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/serialization_test.py +0 -0
  359. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/string_test.py +0 -0
  360. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/sync_test.py +0 -0
  361. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/table_test.py +0 -0
  362. {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/tmp_test.py +0 -0
@@ -8,6 +8,10 @@ on:
8
8
  branches:
9
9
  - main
10
10
 
11
+ concurrency:
12
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
13
+ cancel-in-progress: true
14
+
11
15
  jobs:
12
16
  validate:
13
17
  runs-on: ubuntu-latest
@@ -138,27 +142,7 @@ jobs:
138
142
  needs: validate
139
143
  if: github.event_name == 'pull_request' && needs.validate.result == 'success'
140
144
  runs-on: ubuntu-latest
141
- strategy:
142
- fail-fast: false
143
- matrix:
144
- test-category:
145
- - name: "core"
146
- path: "tests/core,tests/utils"
147
- system-deps: false
148
- timeout: 15
149
- - name: "extractors"
150
- path: "tests/extractors"
151
- system-deps: true
152
- timeout: 20
153
- - name: "integration"
154
- path: "tests/integration,tests/api"
155
- system-deps: true
156
- timeout: 25
157
- - name: "features"
158
- path: "tests/features,tests/interfaces,tests/mcp,tests/multiprocessing,tests/ocr"
159
- system-deps: true
160
- timeout: 20
161
- timeout-minutes: ${{ matrix.test-category.timeout }}
145
+ timeout-minutes: 45
162
146
  steps:
163
147
  - name: Checkout
164
148
  uses: actions/checkout@v5
@@ -170,36 +154,62 @@ jobs:
170
154
 
171
155
  - name: Install Python
172
156
  uses: actions/setup-python@v6
157
+ id: setup-python
173
158
  with:
174
159
  python-version: "3.13"
175
160
 
176
161
  - name: Cache Python Dependencies
162
+ id: python-cache
177
163
  uses: actions/cache@v4
178
164
  with:
179
165
  path: |
180
166
  ~/.cache/uv
181
167
  .venv
182
- key: python-dependencies-ubuntu-latest-3.13-${{ matrix.test-category.name }}-${{ hashFiles('uv.lock') }}
168
+ key: python-dependencies-ubuntu-latest-3.13-${{ hashFiles('uv.lock') }}
183
169
  restore-keys: |
184
170
  python-dependencies-ubuntu-latest-3.13-
185
171
 
186
172
  - name: Install Dependencies
187
- run: uv sync --all-extras --dev
173
+ uses: nick-fields/retry@v3
174
+ with:
175
+ timeout_minutes: 5
176
+ max_attempts: 3
177
+ retry_wait_seconds: 30
178
+ command: |
179
+ uv sync --all-extras --dev
180
+ shell: bash
188
181
 
189
182
  - name: Install System Dependencies
190
- if: matrix.test-category.system-deps
191
- run: |
192
- sudo apt-get update
193
- sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
183
+ uses: nick-fields/retry@v3
184
+ with:
185
+ timeout_minutes: 5
186
+ max_attempts: 3
187
+ retry_wait_seconds: 30
188
+ command: |
189
+ sudo apt-get update
190
+ sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
191
+ shell: bash
194
192
 
195
- - name: Run Tests - ${{ matrix.test-category.name }}
196
- run: uv run pytest $(echo "${{ matrix.test-category.path }}" | tr ',' ' ') -v --reruns 1 --reruns-delay 1 --cov=kreuzberg --cov-append --cov-report=lcov:coverage-${{ matrix.test-category.name }}.lcov
193
+ - name: Run All Tests with Coverage
194
+ uses: nick-fields/retry@v3
195
+ with:
196
+ timeout_minutes: 15
197
+ max_attempts: 3
198
+ retry_wait_seconds: 10
199
+ command: |
200
+ uv run coverage erase
201
+ uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml --reruns 2 --reruns-delay 1
202
+ uv run coverage report --precision=2
203
+ shell: bash
197
204
 
198
205
  - name: Upload Coverage Artifacts
206
+ if: always()
199
207
  uses: actions/upload-artifact@v4
200
208
  with:
201
- name: coverage-${{ matrix.test-category.name }}-${{ github.sha }}
202
- path: coverage-${{ matrix.test-category.name }}.lcov
209
+ name: coverage-pr-${{ github.sha }}
210
+ path: |
211
+ coverage.lcov
212
+ .coverage
203
213
  retention-days: 1
204
214
 
205
215
  coverage-pr:
@@ -214,47 +224,7 @@ jobs:
214
224
  - name: Download Coverage Artifacts
215
225
  uses: actions/download-artifact@v5
216
226
  with:
217
- pattern: coverage-*-${{ github.sha }}
218
- merge-multiple: true
219
-
220
- - name: Install uv
221
- uses: astral-sh/setup-uv@v6
222
- with:
223
- enable-cache: true
224
-
225
- - name: Install Python
226
- uses: actions/setup-python@v6
227
- with:
228
- python-version: "3.13"
229
-
230
- - name: Install Dependencies
231
- run: uv sync --dev
232
-
233
- - name: Combine Coverage Reports
234
- run: |
235
- # Install lcov for combining reports
236
- sudo apt-get update && sudo apt-get install -y lcov
237
-
238
- # List available coverage files
239
- echo "Available coverage files:"
240
- find . -name "coverage-*.lcov" -type f || echo "No coverage files found"
241
-
242
- # Combine all lcov files if they exist
243
- coverage_files=($(find . -name "coverage-*.lcov" -type f))
244
- if [ ${#coverage_files[@]} -gt 0 ]; then
245
- echo "Combining ${#coverage_files[@]} coverage files..."
246
- if [ ${#coverage_files[@]} -eq 1 ]; then
247
- # Only one file, just copy it
248
- cp "${coverage_files[0]}" coverage.lcov
249
- else
250
- # Multiple files, combine them
251
- lcov --rc branch_coverage=1 $(printf " -a %s" "${coverage_files[@]}") -o coverage.lcov
252
- fi
253
- else
254
- echo "No coverage files to combine, creating empty coverage.lcov"
255
- echo "TN:" > coverage.lcov
256
- echo "end_of_record" >> coverage.lcov
257
- fi
227
+ name: coverage-pr-${{ github.sha }}
258
228
 
259
229
  - name: Upload Coverage to DeepSource
260
230
  if: always()
@@ -4,6 +4,10 @@ on:
4
4
  workflow_dispatch:
5
5
  workflow_call:
6
6
 
7
+ concurrency:
8
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
9
+ cancel-in-progress: true
10
+
7
11
  jobs:
8
12
  test-docker-images:
9
13
  runs-on: ubuntu-latest
@@ -17,7 +17,7 @@ permissions:
17
17
 
18
18
  concurrency:
19
19
  group: "pages"
20
- cancel-in-progress: false
20
+ cancel-in-progress: true
21
21
 
22
22
  jobs:
23
23
  build:
@@ -3,6 +3,10 @@ name: Test Docker Builds (No Push)
3
3
  on:
4
4
  workflow_dispatch:
5
5
 
6
+ concurrency:
7
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
8
+ cancel-in-progress: true
9
+
6
10
  jobs:
7
11
  test-build-all-images:
8
12
  runs-on: ubuntu-latest
@@ -26,7 +26,7 @@ repos:
26
26
  hooks:
27
27
  - id: mdformat
28
28
  additional_dependencies:
29
- - mdformat-mkdocs==4.0.0
29
+ - mdformat-mkdocs==4.1.0
30
30
  - repo: https://github.com/igorshubovych/markdownlint-cli
31
31
  rev: v0.45.0
32
32
  hooks:
@@ -36,6 +36,7 @@ repos:
36
36
  hooks:
37
37
  - id: blacken-docs
38
38
  args: ["--pyi", "--line-length", "130"]
39
+ exclude: tests/features/token_reduction_test.py
39
40
  additional_dependencies:
40
41
  - black==25.1.0
41
42
  - repo: https://github.com/rbubley/mirrors-prettier
@@ -48,7 +49,7 @@ repos:
48
49
  hooks:
49
50
  - id: pyproject-fmt
50
51
  - repo: https://github.com/astral-sh/ruff-pre-commit
51
- rev: v0.13.0
52
+ rev: v0.13.1
52
53
  hooks:
53
54
  - id: ruff
54
55
  args: ["--fix", "--unsafe-fixes"]
@@ -59,7 +60,7 @@ repos:
59
60
  - id: codespell
60
61
  exclude: ^tests|^scripts|^kreuzberg/_tesseract|^kreuzberg/_mime_types
61
62
  additional_dependencies:
62
- - tomli
63
+ - tomli==2.2.1
63
64
  - repo: https://github.com/jsh9/pydoclint
64
65
  rev: 0.7.3
65
66
  hooks:
@@ -0,0 +1 @@
1
+ ATTRIBUTIONS.md
@@ -0,0 +1,47 @@
1
+ # Third-Party Attributions
2
+
3
+ This file contains attributions for third-party code, data, and libraries used in Kreuzberg.
4
+
5
+ ## Stopwords Data
6
+
7
+ The stopwords data in `kreuzberg/_token_reduction/stop_words.json` is derived from the [stopwords-iso](https://github.com/stopwords-iso/stopwords-iso) project.
8
+
9
+ **Original Author:** Gene Diaz and contributors
10
+ **License:** MIT License
11
+ **Source:** <https://github.com/stopwords-iso/stopwords-iso>
12
+
13
+ ### MIT License (stopwords-iso)
14
+
15
+ ```text
16
+ MIT License
17
+
18
+ Copyright (c) stopwords-iso contributors
19
+
20
+ Permission is hereby granted, free of charge, to any person obtaining a copy
21
+ of this software and associated documentation files (the "Software"), to deal
22
+ in the Software without restriction, including without limitation the rights
23
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
24
+ copies of the Software, and to permit persons to whom the Software is
25
+ furnished to do so, subject to the following conditions:
26
+
27
+ The above copyright notice and this permission notice shall be included in all
28
+ copies or substantial portions of the Software.
29
+
30
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
33
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
34
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
35
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36
+ SOFTWARE.
37
+ ```
38
+
39
+ ### Changes Made
40
+
41
+ The original stopwords-iso data was used as-is with no modifications to the word lists themselves. The data was packaged into Kreuzberg's `_token_reduction` module for use in the token reduction feature.
42
+
43
+ ______________________________________________________________________
44
+
45
+ ## Other Third-Party Dependencies
46
+
47
+ All other third-party dependencies are listed in `pyproject.toml` with their respective licenses. This section is specifically for bundled/vendored code and data.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.16.0
3
+ Version: 3.17.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -32,12 +32,13 @@ Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
34
  Requires-Dist: html-to-markdown[lxml]>=1.13.0
35
- Requires-Dist: mcp>=1.14.0
35
+ Requires-Dist: langcodes>=3.5.0
36
+ Requires-Dist: mcp>=1.14.1
36
37
  Requires-Dist: msgspec>=0.18.0
37
38
  Requires-Dist: numpy>=2.0.0
38
39
  Requires-Dist: playa-pdf>=0.7.0
39
40
  Requires-Dist: polars>=1.33.1
40
- Requires-Dist: psutil>=7.0.0
41
+ Requires-Dist: psutil>=7.1.0
41
42
  Requires-Dist: pypdfium2==4.30.0
42
43
  Requires-Dist: python-calamine>=0.5.3
43
44
  Requires-Dist: python-pptx>=1.0.2
@@ -49,7 +50,7 @@ Provides-Extra: all
49
50
  Requires-Dist: click>=8.2.1; extra == 'all'
50
51
  Requires-Dist: deep-translator>=1.11.4; extra == 'all'
51
52
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
52
- Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
53
+ Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
53
54
  Requires-Dist: gmft>=0.4.2; extra == 'all'
54
55
  Requires-Dist: keybert>=0.9.0; extra == 'all'
55
56
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
@@ -82,7 +83,7 @@ Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
82
83
  Provides-Extra: gmft
83
84
  Requires-Dist: gmft>=0.4.2; extra == 'gmft'
84
85
  Provides-Extra: langdetect
85
- Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
86
+ Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
86
87
  Provides-Extra: paddleocr
87
88
  Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
88
89
  Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
@@ -0,0 +1,268 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import statistics
5
+ import time
6
+ from dataclasses import dataclass, field
7
+ from pathlib import Path
8
+ from typing import Any, Literal
9
+
10
+ from kreuzberg import extract_bytes_sync
11
+ from kreuzberg._token_reduction import get_reduction_stats, reduce_tokens
12
+ from kreuzberg._types import ExtractionConfig, TokenReductionConfig
13
+
14
+
15
@dataclass
class CompressionResult:
    """Outcome of one token-reduction run for a single (text type, mode) pair.

    The two ``*_ratio`` fields are stored as fractions; the matching
    ``*_compression_percent`` properties expose them scaled by 100.
    """

    # Label of the sample text and the reduction mode that was applied.
    text_type: str
    mode: str
    # Character counts before and after reduction.
    original_length: int
    reduced_length: int
    # Token counts before and after reduction.
    original_tokens: int
    reduced_tokens: int
    # Reduction ratios as fractions.
    character_reduction_ratio: float
    token_reduction_ratio: float
    # Time spent in the reduction call, in milliseconds.
    processing_time_ms: float

    @property
    def character_compression_percent(self) -> float:
        """Return the character reduction ratio scaled to a percentage."""
        fraction = self.character_reduction_ratio
        return fraction * 100

    @property
    def token_compression_percent(self) -> float:
        """Return the token reduction ratio scaled to a percentage."""
        fraction = self.token_reduction_ratio
        return fraction * 100
34
+
35
+
36
@dataclass
class CompressionBenchmarkSuite:
    """Collects ``CompressionResult`` records and aggregates them per mode."""

    # Every recorded result, in insertion order.
    results: list[CompressionResult] = field(default_factory=list)
    # Running count of recorded results.
    total_tests: int = 0
    # Running sum of ``processing_time_ms`` over all recorded results.
    total_time_ms: float = 0.0

    def __post_init__(self) -> None:
        """Derive the running totals when the suite starts with results.

        Without this, ``CompressionBenchmarkSuite(results=[...])`` would
        report zero tests and a zero average, because the totals are
        otherwise only updated by ``add_result``. Totals supplied explicitly
        by the caller are left untouched.
        """
        if self.results and self.total_tests == 0 and self.total_time_ms == 0.0:
            self.total_tests = len(self.results)
            self.total_time_ms = sum(r.processing_time_ms for r in self.results)

    def add_result(self, result: CompressionResult) -> None:
        """Record ``result`` and update the running totals."""
        self.results.append(result)
        self.total_tests += 1
        self.total_time_ms += result.processing_time_ms

    @staticmethod
    def _describe(values: list[float]) -> dict[str, float]:
        """Return mean, median, stdev, min and max of a non-empty list."""
        return {
            "mean": statistics.mean(values),
            "median": statistics.median(values),
            # statistics.stdev raises on fewer than two data points.
            "stdev": statistics.stdev(values) if len(values) > 1 else 0.0,
            "min": min(values),
            "max": max(values),
        }

    def get_summary(self) -> dict[str, Any]:
        """Build a JSON-serialisable report of all recorded results.

        Returns:
            An empty dict when nothing has been recorded. Otherwise a dict
            with three keys: ``"summary"`` (overall totals), ``"by_mode"``
            (per-mode statistics) and ``"detailed_results"`` (one flat dict
            per recorded result).
        """
        if not self.results:
            return {}

        # Group results by mode, preserving first-seen order of the modes.
        by_mode: dict[str, list[CompressionResult]] = {}
        for result in self.results:
            by_mode.setdefault(result.mode, []).append(result)

        mode_stats: dict[str, Any] = {}
        for mode, grouped in by_mode.items():
            times = [r.processing_time_ms for r in grouped]
            mode_stats[mode] = {
                "tests": len(grouped),
                "character_compression": self._describe(
                    [r.character_compression_percent for r in grouped]
                ),
                "token_compression": self._describe(
                    [r.token_compression_percent for r in grouped]
                ),
                "performance": {
                    "avg_time_ms": statistics.mean(times),
                    "total_time_ms": sum(times),
                },
            }

        return {
            "summary": {
                "total_tests": self.total_tests,
                "total_time_ms": self.total_time_ms,
                "avg_time_per_test_ms": self.total_time_ms / self.total_tests if self.total_tests > 0 else 0.0,
            },
            "by_mode": mode_stats,
            "detailed_results": [
                {
                    "text_type": r.text_type,
                    "mode": r.mode,
                    "original_length": r.original_length,
                    "reduced_length": r.reduced_length,
                    "original_tokens": r.original_tokens,
                    "reduced_tokens": r.reduced_tokens,
                    "character_compression_percent": r.character_compression_percent,
                    "token_compression_percent": r.token_compression_percent,
                    "processing_time_ms": r.processing_time_ms,
                }
                for r in self.results
            ],
        }
107
+
108
+
109
class TokenReductionCompressionBenchmark:
    """Benchmark driver for token reduction over a fixed set of sample texts.

    Each sample text is run through ``reduce_tokens`` in every configured
    mode, and one sample is additionally pushed through the full
    ``extract_bytes_sync`` pipeline with token reduction enabled.
    """

    def __init__(self) -> None:
        """Load the sample texts and set the reduction modes to benchmark."""
        self.test_texts = self._create_test_texts()
        self.modes: list[Literal["light", "moderate"]] = ["light", "moderate"]

    def _create_test_texts(self) -> dict[str, str]:
        """Return the sample texts keyed by a short text-type label.

        Every text is stripped of leading and trailing whitespace.
        """
        return {
            "formal_document": """
The quarterly financial report demonstrates significant improvements in operational efficiency and market positioning.
Our comprehensive analysis reveals that the implementation of strategic initiatives has resulted in measurable outcomes
across multiple key performance indicators. The organization's commitment to excellence and continuous improvement
is evident in these results. Furthermore, the systematic approach to risk management and quality assurance has
contributed to enhanced stakeholder confidence and sustainable growth trajectory. The board of directors acknowledges
the exceptional efforts of the management team and all employees in achieving these remarkable results.
""".strip(),
            "casual_conversation": """
Hey there! I was just thinking about that amazing movie we watched last weekend. It was really incredible, wasn't it?
The way they told the story was so compelling and the characters were just wonderful. I think it's one of the best
films I've seen this year. What did you think about it? I'd love to hear your thoughts and maybe we could discuss
some of the themes that really stood out to me. There were so many interesting elements that I'm still thinking about.
""".strip(),
            "technical_manual": """
Configure the system parameters by accessing the administrative interface through the main configuration panel.
Navigate to Settings > Advanced > Network Configuration and verify that all connection parameters are correctly
initialized. The TCP/IP stack must be properly configured with appropriate DNS resolution settings and gateway
routing tables. Execute the diagnostic utilities to validate network connectivity and ensure that all protocols
are functioning within acceptable performance thresholds. Document any configuration changes in the system log
for future reference and troubleshooting procedures.
""".strip(),
            "news_article": """
Local authorities announced today that the new public transportation system will begin operations next month,
connecting several major districts across the metropolitan area. The project, which has been in development for
over three years, represents a significant investment in sustainable urban infrastructure. City officials expect
the system to reduce traffic congestion and provide affordable transportation options for thousands of daily
commuters. Environmental impact studies indicate that the implementation will contribute to reduced carbon emissions
and improved air quality throughout the region.
""".strip(),
            "literature_excerpt": """
The old lighthouse stood majestically against the stormy horizon, its weathered stones bearing witness to countless
tempests and countless ships that had sought its guiding light. Sarah approached the ancient structure with a sense
of reverence, knowing that within its walls lay the stories of generations of lighthouse keepers who had dedicated
their lives to the safety of maritime travelers. The wind howled through the nearby cliffs, carrying with it the
salt spray of crashing waves and the whispered secrets of the sea itself.
""".strip(),
            "scientific_abstract": """
This study investigates the relationship between cognitive load and working memory performance in multilingual
individuals under various experimental conditions. Participants (n=127) completed a series of standardized
assessments while neural activity was monitored using electroencephalography. Results indicate significant
correlations between language switching frequency and executive control efficiency (p<0.001). The findings suggest
that bilingual advantages in cognitive flexibility extend to domain-general executive functions, with implications
for educational policy and cognitive training interventions.
""".strip(),
            "stopword_heavy": """
And so it was that he went to the store, and then he bought some things that he needed for the house. But when
he got back to the place where he lived, he realized that he had forgotten to get the most important thing that
he had originally planned to purchase. So he had to go back to the store again, and this time he made sure to
get everything that he needed. It was a bit frustrating, but in the end, everything worked out just fine.
""".strip(),
            "technical_jargon": """
The microservices architecture implements a distributed system pattern utilizing containerized deployments
orchestrated through Kubernetes clusters. API gateways facilitate service discovery and load balancing across
multiple availability zones. The event-driven messaging infrastructure leverages Apache Kafka for asynchronous
communication between bounded contexts. Monitoring and observability are achieved through OpenTelemetry
instrumentation with Prometheus metrics collection and Grafana visualization dashboards.
""".strip(),
            "minimal_stopwords": """
Python programming language offers powerful features. Machine learning algorithms require extensive datasets.
Neural networks demonstrate remarkable performance capabilities. Developers utilize frameworks like TensorFlow.
Data preprocessing involves cleaning, transformation, validation procedures. Model training requires computational
resources, optimization techniques. Evaluation metrics include accuracy, precision, recall measurements.
Production deployment considerations encompass scalability, monitoring, maintenance requirements.
""".strip(),
        }

    def test_compression_effectiveness(
        self, text: str, text_type: str, mode: Literal["light", "moderate"]
    ) -> CompressionResult:
        """Reduce ``text`` in ``mode`` and return the measured outcome.

        Only the ``reduce_tokens`` call is timed; the statistics are computed
        afterwards by ``get_reduction_stats``.

        Args:
            text: The text to reduce.
            text_type: Label stored on the result to identify the sample.
            mode: Token-reduction mode passed to ``TokenReductionConfig``.

        Returns:
            A ``CompressionResult`` with lengths, token counts, ratios and the
            elapsed time in milliseconds.
        """
        config = TokenReductionConfig(mode=mode, preserve_markdown=False)

        start_time = time.perf_counter()
        reduced_text = reduce_tokens(text, config=config, language="en")
        processing_time = (time.perf_counter() - start_time) * 1000

        stats = get_reduction_stats(text, reduced_text)

        return CompressionResult(
            text_type=text_type,
            mode=mode,
            original_length=len(text),
            reduced_length=len(reduced_text),
            original_tokens=stats["original_tokens"],
            reduced_tokens=stats["reduced_tokens"],
            character_reduction_ratio=stats["character_reduction_ratio"],
            token_reduction_ratio=stats["token_reduction_ratio"],
            processing_time_ms=processing_time,
        )

    def run_comprehensive_benchmark(self) -> CompressionBenchmarkSuite:
        """Run every sample text through every mode and collect the results."""
        suite = CompressionBenchmarkSuite()

        for text_type, text in self.test_texts.items():
            for mode in self.modes:
                suite.add_result(self.test_compression_effectiveness(text, text_type, mode))

        return suite

    def run_pipeline_integration_test(self) -> dict[str, Any]:
        """Run the ``formal_document`` sample through the extraction pipeline.

        For each mode the sample is extracted as ``text/plain`` with token
        reduction enabled.

        Returns:
            A dict keyed by mode. Each value holds the original and reduced
            lengths, the elapsed time in milliseconds, the
            ``"token_reduction"`` metadata entry (``{}`` when absent) and a
            flag saying whether that entry was present.
        """
        pipeline_results: dict[str, Any] = {}

        # Loop-invariant input: look it up and encode it once, outside the
        # timed region, so the measurement covers extraction only.
        test_text = self.test_texts["formal_document"]
        payload = test_text.encode("utf-8")

        for mode in self.modes:
            config = ExtractionConfig(token_reduction=TokenReductionConfig(mode=mode))

            start_time = time.perf_counter()
            result = extract_bytes_sync(payload, "text/plain", config)
            processing_time = (time.perf_counter() - start_time) * 1000

            pipeline_results[mode] = {
                "original_length": len(test_text),
                "reduced_length": len(result.content),
                "processing_time_ms": processing_time,
                "reduction_stats": result.metadata.get("token_reduction", {}),
                "metadata_present": "token_reduction" in result.metadata,
            }

        return pipeline_results
239
+
240
+
241
def main() -> None:
    """Run both benchmarks and write the combined results to a JSON file.

    The report is written to
    ``benchmarks/results/token_reduction_compression.json``, resolved against
    the current working directory. It holds the per-mode compression summary,
    the pipeline integration results and a Unix timestamp.
    """
    benchmark = TokenReductionCompressionBenchmark()

    suite = benchmark.run_comprehensive_benchmark()
    pipeline_results = benchmark.run_pipeline_integration_test()
    summary = suite.get_summary()

    output_dir = Path("benchmarks/results")
    # parents=True so the script also works when "benchmarks/" itself does
    # not exist under the current working directory.
    output_dir.mkdir(parents=True, exist_ok=True)

    full_results = {
        "compression_benchmark": summary,
        "pipeline_integration": pipeline_results,
        "timestamp": time.time(),
    }

    output_file = output_dir / "token_reduction_compression.json"
    # Explicit encoding keeps the output independent of the platform locale.
    with output_file.open("w", encoding="utf-8") as f:
        json.dump(full_results, f, indent=2)


# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()
@@ -84,6 +84,12 @@ Configuration options for converting HTML content to Markdown:
84
84
 
85
85
  ::: kreuzberg.HTMLToMarkdownConfig
86
86
 
87
+ ## Token Reduction Configuration
88
+
89
+ Configuration options for token reduction and text optimization:
90
+
91
+ ::: kreuzberg.TokenReductionConfig
92
+
87
93
  ## PSMMode (Page Segmentation Mode)
88
94
 
89
95
  ::: kreuzberg.PSMMode
@@ -8,6 +8,7 @@ This guide provides comprehensive documentation for the Kreuzberg document intel
8
8
  - [Extraction Configuration](extraction-configuration.md) - Configure the extraction process ([API](../api-reference/types.md#extractionconfig))
9
9
  - [Metadata Extraction](metadata-extraction.md) - Document metadata extraction ([API](../api-reference/types.md#metadata))
10
10
  - [Content Chunking](chunking.md) - Split documents into manageable chunks
11
+ - [Token Reduction](token-reduction.md) - Optimize text for LLMs and storage ([API](../api-reference/types.md#tokenreductionconfig))
11
12
  - [Document Classification](document-classification.md) - Automatic document type detection
12
13
  - [OCR Configuration](ocr-configuration.md) - Configure OCR settings ([API](../api-reference/ocr-configuration.md))
13
14
  - [OCR Backends](ocr-backends.md) - Choose and configure different OCR engines