kreuzberg 3.13.3__tar.gz → 3.14.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (275) hide show
  1. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/ci.yaml +179 -16
  2. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/docker-e2e-tests.yml +1 -1
  3. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/publish-docker.yml +1 -0
  4. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.pre-commit-config.yaml +1 -7
  5. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/PKG-INFO +4 -4
  6. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/ai-rulez.yaml +73 -7
  7. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/examples/extraction-examples.md +1 -1
  8. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/api-server.md +59 -0
  9. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/extraction-configuration.md +75 -0
  10. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/ocr-configuration.md +65 -1
  11. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_api/main.py +126 -18
  12. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_config.py +0 -1
  13. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_image.py +20 -2
  14. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_pdf.py +21 -1
  15. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_spread_sheet.py +0 -1
  16. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_gmft.py +79 -33
  17. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_mcp/server.py +0 -76
  18. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/_base.py +1 -2
  19. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/_paddleocr.py +39 -13
  20. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/_tesseract.py +2 -3
  21. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_registry.py +26 -0
  22. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_types.py +66 -3
  23. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_cache.py +34 -12
  24. kreuzberg-3.14.1/kreuzberg/_utils/_image_preprocessing.py +346 -0
  25. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_ocr_cache.py +2 -5
  26. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_process_pool.py +3 -3
  27. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_table.py +4 -1
  28. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/cli.py +19 -2
  29. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/extraction.py +4 -4
  30. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/pyproject.toml +7 -7
  31. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/api/main_test.py +36 -2
  32. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/conftest.py +99 -0
  33. kreuzberg-3.14.1/tests/core/dpi_configuration_test.py +353 -0
  34. kreuzberg-3.14.1/tests/core/html_to_markdown_config_test.py +0 -0
  35. kreuzberg-3.14.1/tests/core/mime_types_test.py +0 -0
  36. kreuzberg-3.14.1/tests/core/registry_test.py +0 -0
  37. kreuzberg-3.14.1/tests/core/types_test.py +0 -0
  38. kreuzberg-3.14.1/tests/e2e/__init__.py +0 -0
  39. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/e2e/docker_e2e_test.py +4 -4
  40. kreuzberg-3.14.1/tests/extractors/__init__.py +0 -0
  41. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/html_test.py +1 -1
  42. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/image_test.py +7 -3
  43. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/pandoc_test.py +1 -0
  44. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/pdf_test.py +7 -22
  45. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/presentation_test.py +1 -1
  46. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/spreed_sheet_test.py +4 -0
  47. kreuzberg-3.14.1/tests/features/__init__.py +0 -0
  48. kreuzberg-3.14.1/tests/features/chunker_test.py +0 -0
  49. kreuzberg-3.14.1/tests/features/document_classification_test.py +0 -0
  50. kreuzberg-3.14.1/tests/features/entity_extraction_test.py +0 -0
  51. kreuzberg-3.14.1/tests/features/gmft_test.py +0 -0
  52. kreuzberg-3.14.1/tests/features/hooks_test.py +0 -0
  53. kreuzberg-3.14.1/tests/features/language_detection_test.py +0 -0
  54. kreuzberg-3.14.1/tests/integration/__init__.py +0 -0
  55. kreuzberg-3.14.1/tests/integration/api/__init__.py +0 -0
  56. kreuzberg-3.14.1/tests/integration/api/large_file_test.py +0 -0
  57. kreuzberg-3.14.1/tests/integration/api/mounted_config_test.py +0 -0
  58. kreuzberg-3.14.1/tests/integration/dpi_integration_test.py +244 -0
  59. kreuzberg-3.14.1/tests/integration/multiprocessing/__init__.py +0 -0
  60. kreuzberg-3.14.1/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
  61. kreuzberg-3.14.1/tests/integration/ocr/__init__.py +0 -0
  62. kreuzberg-3.14.1/tests/integration/ocr/device_integration_test.py +0 -0
  63. kreuzberg-3.14.1/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
  64. kreuzberg-3.14.1/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
  65. kreuzberg-3.14.1/tests/integration/regression_test.py +134 -0
  66. kreuzberg-3.14.1/tests/interfaces/__init__.py +0 -0
  67. kreuzberg-3.14.1/tests/mcp/__init__.py +0 -0
  68. kreuzberg-3.14.1/tests/mcp/mcp_server_test.py +0 -0
  69. kreuzberg-3.14.1/tests/multiprocessing/__init__.py +0 -0
  70. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/multiprocessing/gmft_isolated_test.py +54 -58
  71. kreuzberg-3.14.1/tests/ocr/__init__.py +0 -0
  72. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/easyocr_test.py +1 -1
  73. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/paddleocr_test.py +7 -5
  74. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/tesseract_test.py +1 -1
  75. kreuzberg-3.14.1/tests/test_source_files/sharable-web-guide.pdf +0 -0
  76. kreuzberg-3.14.1/tests/utils/__init__.py +0 -0
  77. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/device_test.py +1 -1
  78. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/ocr_cache_test.py +7 -6
  79. kreuzberg-3.14.1/tests/utils/playa_helpers_test.py +0 -0
  80. kreuzberg-3.14.1/tests/utils/playa_test.py +0 -0
  81. kreuzberg-3.14.1/tests/utils/quality_test.py +121 -0
  82. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/serialization_test.py +1 -1
  83. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/table_test.py +6 -6
  84. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/tmp_test.py +1 -1
  85. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/uv.lock +297 -286
  86. kreuzberg-3.13.3/tests/api/large_file_test.py +0 -184
  87. kreuzberg-3.13.3/tests/api/mounted_config_test.py +0 -184
  88. kreuzberg-3.13.3/tests/chunker_test.py +0 -102
  89. kreuzberg-3.13.3/tests/cli_command_test.py +0 -481
  90. kreuzberg-3.13.3/tests/cli_integration_test.py +0 -858
  91. kreuzberg-3.13.3/tests/cli_test.py +0 -324
  92. kreuzberg-3.13.3/tests/config_test.py +0 -1540
  93. kreuzberg-3.13.3/tests/document_classification_test.py +0 -837
  94. kreuzberg-3.13.3/tests/entity_extraction_test.py +0 -588
  95. kreuzberg-3.13.3/tests/exceptions_test.py +0 -91
  96. kreuzberg-3.13.3/tests/extraction_batch_test.py +0 -253
  97. kreuzberg-3.13.3/tests/extraction_test.py +0 -752
  98. kreuzberg-3.13.3/tests/gmft_extended_test.py +0 -137
  99. kreuzberg-3.13.3/tests/gmft_test.py +0 -788
  100. kreuzberg-3.13.3/tests/hooks_test.py +0 -205
  101. kreuzberg-3.13.3/tests/html_to_markdown_config_test.py +0 -217
  102. kreuzberg-3.13.3/tests/language_detection_test.py +0 -152
  103. kreuzberg-3.13.3/tests/mcp_server_test.py +0 -757
  104. kreuzberg-3.13.3/tests/mime_types_test.py +0 -195
  105. kreuzberg-3.13.3/tests/multiprocessing/gmft_integration_test.py +0 -98
  106. kreuzberg-3.13.3/tests/ocr/device_integration_test.py +0 -268
  107. kreuzberg-3.13.3/tests/ocr/tesseract_tsv_integration_test.py +0 -273
  108. kreuzberg-3.13.3/tests/playa_helpers_test.py +0 -473
  109. kreuzberg-3.13.3/tests/playa_test.py +0 -111
  110. kreuzberg-3.13.3/tests/registry_test.py +0 -190
  111. kreuzberg-3.13.3/tests/regression_api_test.py +0 -81
  112. kreuzberg-3.13.3/tests/regression_test.py +0 -159
  113. kreuzberg-3.13.3/tests/regression_with_config_test.py +0 -145
  114. kreuzberg-3.13.3/tests/tesseract_sync_formats_test.py +0 -169
  115. kreuzberg-3.13.3/tests/types_test.py +0 -374
  116. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.commitlintrc +0 -0
  117. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.deepsource.toml +0 -0
  118. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.docker/Dockerfile +0 -0
  119. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.docker/README.md +0 -0
  120. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.dockerignore +0 -0
  121. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/dependabot.yaml +0 -0
  122. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/docs.yml +0 -0
  123. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/pr-title.yaml +0 -0
  124. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/release.yaml +0 -0
  125. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/test-docker-builds.yml +0 -0
  126. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.gitignore +0 -0
  127. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.markdownlint.yaml +0 -0
  128. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/LICENSE +0 -0
  129. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/README.md +0 -0
  130. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/Taskfile.yml +0 -0
  131. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/README.md +0 -0
  132. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/__init__.py +0 -0
  133. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/pyproject.toml +0 -0
  134. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/__init__.py +0 -0
  135. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/__main__.py +0 -0
  136. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/benchmarks.py +0 -0
  137. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/cli.py +0 -0
  138. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/models.py +0 -0
  139. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/profiler.py +0 -0
  140. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/runner.py +0 -0
  141. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docker-compose.example.yml +0 -0
  142. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docker-logs/docker-info.txt +0 -0
  143. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docker-logs/docker-version.txt +0 -0
  144. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/advanced/custom-extractors.md +0 -0
  145. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/advanced/custom-hooks.md +0 -0
  146. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/advanced/error-handling.md +0 -0
  147. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/advanced/index.md +0 -0
  148. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/advanced/performance.md +0 -0
  149. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/exceptions.md +0 -0
  150. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/extraction-functions.md +0 -0
  151. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/extractor-registry.md +0 -0
  152. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/index.md +0 -0
  153. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/ocr-configuration.md +0 -0
  154. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/types.md +0 -0
  155. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/assets/favicon.png +0 -0
  156. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/assets/logo.png +0 -0
  157. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/cli.md +0 -0
  158. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/contributing.md +0 -0
  159. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/css/extra.css +0 -0
  160. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/examples/index.md +0 -0
  161. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/getting-started/index.md +0 -0
  162. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/getting-started/installation.md +0 -0
  163. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/getting-started/quick-start.md +0 -0
  164. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/index.md +0 -0
  165. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/basic-usage.md +0 -0
  166. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/chunking.md +0 -0
  167. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/docker.md +0 -0
  168. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/document-classification.md +0 -0
  169. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/index.md +0 -0
  170. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/mcp-server.md +0 -0
  171. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/metadata-extraction.md +0 -0
  172. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/ocr-backends.md +0 -0
  173. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/supported-formats.md +0 -0
  174. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/__init__.py +0 -0
  175. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/__main__.py +0 -0
  176. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_api/__init__.py +0 -0
  177. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_chunker.py +0 -0
  178. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_constants.py +0 -0
  179. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_document_classification.py +0 -0
  180. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_entity_extraction.py +0 -0
  181. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/__init__.py +0 -0
  182. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_base.py +0 -0
  183. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_email.py +0 -0
  184. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_html.py +0 -0
  185. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_pandoc.py +0 -0
  186. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_presentation.py +0 -0
  187. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_structured.py +0 -0
  188. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_language_detection.py +0 -0
  189. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_mcp/__init__.py +0 -0
  190. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_mime_types.py +0 -0
  191. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/__init__.py +0 -0
  192. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/_easyocr.py +0 -0
  193. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/_table_extractor.py +0 -0
  194. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_playa.py +0 -0
  195. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/__init__.py +0 -0
  196. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_device.py +0 -0
  197. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_document_cache.py +0 -0
  198. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_errors.py +0 -0
  199. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
  200. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_quality.py +0 -0
  201. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_ref.py +0 -0
  202. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_serialization.py +0 -0
  203. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_string.py +0 -0
  204. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_sync.py +0 -0
  205. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_tmp.py +0 -0
  206. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/exceptions.py +0 -0
  207. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/py.typed +0 -0
  208. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/mkdocs.yaml +0 -0
  209. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/output.txt +0 -0
  210. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/results/baseline.json +0 -0
  211. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/results/serialization.json +0 -0
  212. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/results/statistical.json +0 -0
  213. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/test_report.json +0 -0
  214. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/__init__.py +0 -0
  215. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/api/__init__.py +0 -0
  216. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/api/conftest.py +0 -0
  217. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/api/runtime_config_test.py +0 -0
  218. {kreuzberg-3.13.3/tests/e2e → kreuzberg-3.14.1/tests/core}/__init__.py +0 -0
  219. /kreuzberg-3.13.3/tests/extractors/__init__.py → /kreuzberg-3.14.1/tests/core/config_test.py +0 -0
  220. /kreuzberg-3.13.3/tests/multiprocessing/__init__.py → /kreuzberg-3.14.1/tests/core/exceptions_test.py +0 -0
  221. /kreuzberg-3.13.3/tests/ocr/__init__.py → /kreuzberg-3.14.1/tests/core/extraction_batch_test.py +0 -0
  222. /kreuzberg-3.13.3/tests/utils/__init__.py → /kreuzberg-3.14.1/tests/core/extraction_test.py +0 -0
  223. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/email_test.py +0 -0
  224. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/pandoc_metadata_test.py +0 -0
  225. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/structured_test.py +0 -0
  226. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/multiprocessing/process_manager_test.py +0 -0
  227. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  228. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/base_test.py +0 -0
  229. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/init_test.py +0 -0
  230. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/tesseract_tsv_test.py +0 -0
  231. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
  232. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/contract.txt +0 -0
  233. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/contract_test.txt +0 -0
  234. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/document.docx +0 -0
  235. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/email/sample-email.eml +0 -0
  236. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  237. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/excel.xlsx +0 -0
  238. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/flower-no-text.jpg +0 -0
  239. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/form_test.txt +0 -0
  240. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/french-text.txt +0 -0
  241. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/german-text.txt +0 -0
  242. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/google-doc-document.pdf +0 -0
  243. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/html.html +0 -0
  244. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/images/test_hello_world.png +0 -0
  245. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/invoice_image.png +0 -0
  246. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/invoice_test.txt +0 -0
  247. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/json/sample-document.json +0 -0
  248. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  249. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/markdown.md +0 -0
  250. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
  251. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/non-searchable.pdf +0 -0
  252. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/ocr-image.jpg +0 -0
  253. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  254. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  255. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  256. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  257. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/receipt_test.txt +0 -0
  258. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/report_test.txt +0 -0
  259. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/sample-contract.pdf +0 -0
  260. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/scanned.pdf +0 -0
  261. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/searchable.pdf +0 -0
  262. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/spanish-text.txt +0 -0
  263. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/tables/borderless_table.png +0 -0
  264. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/tables/complex_document.png +0 -0
  265. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/tables/simple_table.png +0 -0
  266. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/test-article.pdf +0 -0
  267. /kreuzberg-3.13.3/tests/test_source_files/testXls.xls → /kreuzberg-3.14.1/tests/test_source_files/test-excel.xls +0 -0
  268. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  269. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/cache_test.py +0 -0
  270. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/errors_test.py +0 -0
  271. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/pdf_lock_test.py +0 -0
  272. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/process_pool_test.py +0 -0
  273. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/ref_test.py +0 -0
  274. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/string_test.py +0 -0
  275. {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/sync_test.py +0 -0
@@ -7,7 +7,6 @@ on:
7
7
  push:
8
8
  branches:
9
9
  - main
10
- - feat/smart-multiprocessing
11
10
 
12
11
  jobs:
13
12
  validate:
@@ -38,7 +37,7 @@ jobs:
38
37
  echo "Removing existing .venv directory on Windows"
39
38
  rm -rf .venv
40
39
  fi
41
- uv sync --all-packages --all-extras --dev
40
+ uv sync --all-extras --dev
42
41
  shell: bash
43
42
 
44
43
  - name: Load Cached Pre-Commit Dependencies
@@ -53,6 +52,7 @@ jobs:
53
52
 
54
53
  coverage:
55
54
  needs: validate
55
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
56
56
  runs-on: ubuntu-latest
57
57
  timeout-minutes: 120
58
58
  steps:
@@ -88,7 +88,7 @@ jobs:
88
88
  max_attempts: 3
89
89
  retry_wait_seconds: 30
90
90
  command: |
91
- uv sync --all-packages --all-extras --dev
91
+ uv sync --all-extras --dev
92
92
  shell: bash
93
93
 
94
94
  - name: Install System Dependencies
@@ -115,7 +115,7 @@ jobs:
115
115
  shell: bash
116
116
 
117
117
  - name: Upload Coverage to DeepSource
118
- if: always() && github.event_name == 'push'
118
+ if: always()
119
119
  env:
120
120
  DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
121
121
  run: |
@@ -134,15 +134,178 @@ jobs:
134
134
  .coverage
135
135
  retention-days: 7
136
136
 
137
- test:
138
- needs: coverage
137
+ test-pr:
138
+ needs: validate
139
+ if: github.event_name == 'pull_request' && needs.validate.result == 'success'
140
+ runs-on: ubuntu-latest
141
+ strategy:
142
+ fail-fast: false
143
+ matrix:
144
+ test-category:
145
+ - name: "core"
146
+ path: "tests/core,tests/utils"
147
+ system-deps: false
148
+ timeout: 15
149
+ - name: "extractors"
150
+ path: "tests/extractors"
151
+ system-deps: true
152
+ timeout: 20
153
+ - name: "integration"
154
+ path: "tests/integration,tests/api"
155
+ system-deps: true
156
+ timeout: 25
157
+ - name: "features"
158
+ path: "tests/features,tests/interfaces,tests/mcp,tests/multiprocessing,tests/ocr"
159
+ system-deps: true
160
+ timeout: 20
161
+ timeout-minutes: ${{ matrix.test-category.timeout }}
162
+ steps:
163
+ - name: Checkout
164
+ uses: actions/checkout@v5
165
+
166
+ - name: Install uv
167
+ uses: astral-sh/setup-uv@v6
168
+ with:
169
+ enable-cache: true
170
+
171
+ - name: Install Python
172
+ uses: actions/setup-python@v6
173
+ with:
174
+ python-version: "3.13"
175
+
176
+ - name: Cache Python Dependencies
177
+ uses: actions/cache@v4
178
+ with:
179
+ path: |
180
+ ~/.cache/uv
181
+ .venv
182
+ key: python-dependencies-ubuntu-latest-3.13-${{ matrix.test-category.name }}-${{ hashFiles('uv.lock') }}
183
+ restore-keys: |
184
+ python-dependencies-ubuntu-latest-3.13-
185
+
186
+ - name: Install Dependencies
187
+ run: uv sync --all-extras --dev
188
+
189
+ - name: Install System Dependencies
190
+ if: matrix.test-category.system-deps
191
+ run: |
192
+ sudo apt-get update
193
+ sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
194
+
195
+ - name: Run Tests - ${{ matrix.test-category.name }}
196
+ run: uv run pytest $(echo "${{ matrix.test-category.path }}" | tr ',' ' ') -v --reruns 1 --reruns-delay 1 --cov=kreuzberg --cov-append --cov-report=lcov:coverage-${{ matrix.test-category.name }}.lcov
197
+
198
+ - name: Upload Coverage Artifacts
199
+ uses: actions/upload-artifact@v4
200
+ with:
201
+ name: coverage-${{ matrix.test-category.name }}-${{ github.sha }}
202
+ path: coverage-${{ matrix.test-category.name }}.lcov
203
+ retention-days: 1
204
+
205
+ coverage-pr:
206
+ needs: test-pr
207
+ if: github.event_name == 'pull_request' && always()
208
+ runs-on: ubuntu-latest
209
+ timeout-minutes: 10
210
+ steps:
211
+ - name: Checkout
212
+ uses: actions/checkout@v5
213
+
214
+ - name: Download Coverage Artifacts
215
+ uses: actions/download-artifact@v4
216
+ with:
217
+ pattern: coverage-*-${{ github.sha }}
218
+ merge-multiple: true
219
+
220
+ - name: Install uv
221
+ uses: astral-sh/setup-uv@v6
222
+ with:
223
+ enable-cache: true
224
+
225
+ - name: Install Python
226
+ uses: actions/setup-python@v6
227
+ with:
228
+ python-version: "3.13"
229
+
230
+ - name: Install Dependencies
231
+ run: uv sync --dev
232
+
233
+ - name: Combine Coverage Reports
234
+ run: |
235
+ # Install lcov for combining reports
236
+ sudo apt-get update && sudo apt-get install -y lcov
237
+
238
+ # List available coverage files
239
+ echo "Available coverage files:"
240
+ find . -name "coverage-*.lcov" -type f || echo "No coverage files found"
241
+
242
+ # Combine all lcov files if they exist
243
+ coverage_files=($(find . -name "coverage-*.lcov" -type f))
244
+ if [ ${#coverage_files[@]} -gt 0 ]; then
245
+ echo "Combining ${#coverage_files[@]} coverage files..."
246
+ if [ ${#coverage_files[@]} -eq 1 ]; then
247
+ # Only one file, just copy it
248
+ cp "${coverage_files[0]}" coverage.lcov
249
+ else
250
+ # Multiple files, combine them
251
+ lcov --rc branch_coverage=1 $(printf " -a %s" "${coverage_files[@]}") -o coverage.lcov
252
+ fi
253
+ else
254
+ echo "No coverage files to combine, creating empty coverage.lcov"
255
+ echo "TN:" > coverage.lcov
256
+ echo "end_of_record" >> coverage.lcov
257
+ fi
258
+
259
+ - name: Upload Coverage to DeepSource
260
+ if: always()
261
+ env:
262
+ DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
263
+ run: |
264
+ # Install DeepSource CLI
265
+ curl -fsSL https://deepsource.io/cli | sh
266
+ # Upload coverage report
267
+ ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
268
+
269
+ test-full:
270
+ needs: validate
271
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main' && needs.validate.result == 'success'
139
272
  runs-on: ${{ matrix.os }}
140
273
  strategy:
141
274
  fail-fast: false
142
275
  matrix:
143
276
  os: [ubuntu-latest, windows-latest, macos-latest]
144
277
  python: ["3.10", "3.11", "3.12", "3.13"]
145
- timeout-minutes: 120
278
+ test-category:
279
+ - name: "core"
280
+ path: "tests/core,tests/utils"
281
+ system-deps: false
282
+ timeout: 20
283
+ - name: "extractors"
284
+ path: "tests/extractors"
285
+ system-deps: true
286
+ timeout: 25
287
+ - name: "integration"
288
+ path: "tests/integration,tests/api"
289
+ system-deps: true
290
+ timeout: 30
291
+ - name: "features"
292
+ path: "tests/features,tests/interfaces,tests/mcp,tests/multiprocessing,tests/ocr"
293
+ system-deps: true
294
+ timeout: 25
295
+ exclude:
296
+ - test-category: {name: "extractors"}
297
+ python: "3.11"
298
+ - test-category: {name: "extractors"}
299
+ python: "3.12"
300
+ - test-category: {name: "integration"}
301
+ python: "3.11"
302
+ - test-category: {name: "integration"}
303
+ python: "3.12"
304
+ - test-category: {name: "features"}
305
+ python: "3.11"
306
+ - test-category: {name: "features"}
307
+ python: "3.12"
308
+ timeout-minutes: ${{ matrix.test-category.timeout }}
146
309
  steps:
147
310
  - name: Checkout
148
311
  uses: actions/checkout@v5
@@ -180,7 +343,7 @@ jobs:
180
343
  echo "Removing existing .venv directory on Windows"
181
344
  rm -rf .venv
182
345
  fi
183
- uv sync --all-packages --all-extras --dev
346
+ uv sync --all-extras --dev
184
347
  shell: bash
185
348
 
186
349
  - name: Cache Test Artifacts
@@ -190,7 +353,7 @@ jobs:
190
353
  key: pytest-cache-${{ matrix.os }}-${{ matrix.python }}
191
354
 
192
355
  - name: Cache and Install Homebrew (macOS)
193
- if: runner.os == 'macOS'
356
+ if: runner.os == 'macOS' && matrix.test-category.system-deps
194
357
  uses: nick-fields/retry@v3
195
358
  with:
196
359
  timeout_minutes: 10
@@ -204,7 +367,7 @@ jobs:
204
367
  shell: bash
205
368
 
206
369
  - name: Cache and Install APT Packages (Linux)
207
- if: runner.os == 'Linux'
370
+ if: runner.os == 'Linux' && matrix.test-category.system-deps
208
371
  uses: nick-fields/retry@v3
209
372
  with:
210
373
  timeout_minutes: 5
@@ -216,7 +379,7 @@ jobs:
216
379
  shell: bash
217
380
 
218
381
  - name: Install System Dependencies (Windows)
219
- if: runner.os == 'Windows'
382
+ if: runner.os == 'Windows' && matrix.test-category.system-deps
220
383
  uses: nick-fields/retry@v3
221
384
  with:
222
385
  timeout_minutes: 10
@@ -231,12 +394,12 @@ jobs:
231
394
  pandoc --version
232
395
  shell: pwsh
233
396
 
234
- - name: Run Tests (without coverage)
397
+ - name: Run Tests - ${{ matrix.test-category.name }}
235
398
  uses: nick-fields/retry@v3
236
399
  with:
237
- timeout_minutes: 15
238
- max_attempts: 3
239
- retry_wait_seconds: 10
400
+ timeout_minutes: 10
401
+ max_attempts: 2
402
+ retry_wait_seconds: 5
240
403
  command: |
241
- uv run pytest -s -vvv --reruns 2 --reruns-delay 1
404
+ uv run pytest $(echo "${{ matrix.test-category.path }}" | tr ',' ' ') -v --reruns 1 --reruns-delay 1
242
405
  shell: bash
@@ -7,7 +7,7 @@ on:
7
7
  jobs:
8
8
  test-docker-images:
9
9
  runs-on: ubuntu-latest
10
- timeout-minutes: 60
10
+ timeout-minutes: 360
11
11
  strategy:
12
12
  matrix:
13
13
  image:
@@ -28,6 +28,7 @@ jobs:
28
28
  needs: test-images
29
29
  if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'release' }}
30
30
  runs-on: ubuntu-latest
31
+ timeout-minutes: 360
31
32
  permissions:
32
33
  contents: read
33
34
  packages: write
@@ -5,12 +5,6 @@ repos:
5
5
  - id: commitlint
6
6
  stages: [commit-msg]
7
7
  additional_dependencies: ["@commitlint/config-conventional"]
8
- # Temporarily disabled due to CI environment issues
9
- # - repo: https://github.com/Goldziher/ai-rulez
10
- # rev: v2.0.1
11
- # hooks:
12
- # - id: ai-rulez-validate
13
- # - id: ai-rulez-generate
14
8
  - repo: https://github.com/pre-commit/pre-commit-hooks
15
9
  rev: v6.0.0
16
10
  hooks:
@@ -54,7 +48,7 @@ repos:
54
48
  hooks:
55
49
  - id: pyproject-fmt
56
50
  - repo: https://github.com/astral-sh/ruff-pre-commit
57
- rev: v0.12.12
51
+ rev: v0.13.0
58
52
  hooks:
59
53
  - id: ruff
60
54
  args: ["--fix", "--unsafe-fixes"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.13.3
3
+ Version: 3.14.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,10 +31,10 @@ Requires-Python: >=3.10
31
31
  Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.10.0
35
- Requires-Dist: mcp>=1.13.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.11.0
35
+ Requires-Dist: mcp>=1.14.0
36
36
  Requires-Dist: msgspec>=0.18.0
37
- Requires-Dist: numpy>=1.24.0
37
+ Requires-Dist: numpy>=2.0.0
38
38
  Requires-Dist: playa-pdf>=0.7.0
39
39
  Requires-Dist: polars>=1.33.1
40
40
  Requires-Dist: psutil>=7.0.0
@@ -388,6 +388,18 @@ rules:
388
388
  - When committing, always use the format specified in the CLAUDE.md
389
389
  name: Important Instructions
390
390
  priority: critical
391
+ - content: |
392
+ ### Docstrings and Comments Guidelines
393
+ - **NO docstrings in private functions** (functions starting with `_`)
394
+ - **NO docstrings in private files** (files starting with `_`)
395
+ - **NO docstrings in private folders** (folders starting with `_`)
396
+ - **NO docstrings in test files** (files in `tests/` directory)
397
+ - **ONLY use docstrings in public API** (what's documented in API reference)
398
+ - **NO redundant comments** - code should be self-documenting
399
+ - **Comments only when necessary** to explain complex logic or non-obvious decisions
400
+ - **Prefer clear variable/function names** over comments
401
+ name: Documentation Standards
402
+ priority: critical
391
403
  - content: |
392
404
  ### Core Extraction Flow
393
405
  1. **Entry Point**: `extraction.py` provides main functions (`extract_file`, `extract_bytes`, etc.)
@@ -425,14 +437,68 @@ rules:
425
437
  name: Error Handling
426
438
  priority: medium
427
439
  - content: |
428
- - Test files in `tests/test_source_files/` for various formats
429
- - Mock OCR responses for predictable testing
430
- - Both sync and async test variants
431
- - Comprehensive error case coverage
432
- - OCR tests marked as `xfail` in CI environments for resilience
433
- - Integration tests use timeouts and retry logic where appropriate
440
+ ### File Organization and Naming
441
+ - **Test files MUST end with `_test.py`** - No exceptions
442
+ - **Logical directory structure**:
443
+ ```
444
+ tests/
445
+ ├── core/ # Core functionality (extraction, config, types)
446
+ ├── features/ # Feature-specific (chunking, language detection)
447
+ ├── integration/ # Integration tests (API, real file processing)
448
+ ├── interfaces/ # User interfaces (CLI, MCP)
449
+ ├── extractors/ # File format extractors (PDF, image, etc.)
450
+ ├── ocr/ # OCR backend tests
451
+ ├── utils/ # Utility function tests
452
+ └── e2e/ # End-to-end tests
453
+ ```
454
+
455
+ ### Test Structure and Patterns
456
+ - **Only function-based tests** - No class-based test methods
457
+ - **Test naming**: `test_<function>_<scenario>_<expected_outcome>`
458
+ - ✅ `test_extract_pdf_with_ocr_returns_text()`
459
+ - ✅ `test_extract_file_raises_validation_error_when_file_missing()`
460
+ - ❌ `test_basic_extraction()` (too vague)
461
+ - **Async/sync variants**: Test both async and sync versions where applicable
462
+ - **Parameterized tests**: Use `@pytest.mark.parametrize` for multiple scenarios
463
+
464
+ ### Mocking Guidelines
465
+ - **NEVER mock anyio/asyncio** - Only mock external dependencies
466
+ - **Mock external services**: OCR engines, file system operations, network calls
467
+ - **Use real objects when possible**: Prefer `tmp_path` over mocking file operations
468
+ - **Mock sparingly**: Only when necessary for isolation or performance
469
+ - **Add comments for legitimate mocks**: Explain why mocking is required
470
+ ```python
471
+ # Mock OCR backend for predictable testing ~keep
472
+ mock_ocr = mocker.patch("kreuzberg._ocr.get_backend")
473
+ ```
474
+
475
+ ### Fixtures and Test Data
476
+ - **Session-scoped fixtures** for stateless objects (extractors, configs)
477
+ - **Shared fixtures** in `tests/conftest.py` for common test data
478
+ - **Test files** in `tests/test_source_files/` for various formats
479
+ - **Temporary files** using pytest's `tmp_path` fixture
480
+
481
+ ### Test Coverage and Quality
482
+ - **95% minimum coverage** requirement
483
+ - **Test all error paths** - Every exception should be tested
484
+ - **Edge cases**: Empty inputs, large files, malformed data
485
+ - **Performance considerations**: Mark slow tests appropriately
486
+ - **CI resilience**: OCR tests marked as `xfail` in CI environments
487
+
488
+ ### Test Helpers and Utilities
489
+ - **Shared assertions** in `tests/extractors/test_helpers.py`:
490
+ - `assert_valid_extraction_result()` - Standard result validation
491
+ - `assert_extraction_error()` - Error case validation
492
+ - `assert_ocr_result()` - OCR-specific validation
493
+ - **Avoid repetitive assertions** - Use helper functions
494
+
495
+ ### Integration vs Unit Tests
496
+ - **Unit tests**: Fast, isolated, mock external dependencies
497
+ - **Integration tests**: Real file processing, external services, end-to-end flows
498
+ - **Separation**: Integration tests in `tests/integration/` directory
499
+ - **Timeouts**: Integration tests use timeouts and retry logic
434
500
  name: Testing Patterns
435
- priority: low
501
+ priority: critical
436
502
  - content: |
437
503
  ### GitHub Actions Workflows
438
504
  - **Release**: Automated PyPI publishing via GitHub releases, triggers Docker builds
@@ -135,7 +135,7 @@ async def extract_tables_from_pdf():
135
135
  print(f"Table {i+1} on page {table['page_number']}:")
136
136
  print(table["text"]) # Markdown formatted table
137
137
 
138
- # Work with the pandas DataFrame
138
+ # Work with the polars DataFrame
139
139
  df = table["df"]
140
140
  print(f"Table shape: {df.shape}")
141
141
 
@@ -62,6 +62,7 @@ Extract text from one or more files.
62
62
  - Method: `POST`
63
63
  - Content-Type: `multipart/form-data`
64
64
  - Body: One or more files with field name `data`
65
+ - **Maximum file size: 1GB per file**
65
66
 
66
67
  **Response:**
67
68
 
@@ -222,6 +223,42 @@ curl -X POST "http://localhost:8000/extract?max_chars=1000" \
222
223
 
223
224
  Result: max_chars will be 500 (from header)
224
225
 
226
+ ## Interactive API Documentation
227
+
228
+ Kreuzberg automatically generates comprehensive OpenAPI documentation that you can access through your web browser when the API server is running.
229
+
230
+ ### Accessing the Documentation
231
+
232
+ Once the API server is running, you can access interactive documentation at:
233
+
234
+ - **OpenAPI Schema**: `http://localhost:8000/schema/openapi.json`
235
+ - **Swagger UI**: `http://localhost:8000/schema/swagger`
236
+ - **ReDoc Documentation**: `http://localhost:8000/schema/redoc`
237
+ - **Stoplight Elements**: `http://localhost:8000/schema/elements`
238
+ - **RapiDoc**: `http://localhost:8000/schema/rapidoc`
239
+
240
+ ### Features
241
+
242
+ The interactive documentation provides:
243
+
244
+ - **Complete API Reference**: All endpoints with detailed parameter descriptions
245
+ - **Try It Out**: Test API endpoints directly from the browser
246
+ - **Request/Response Examples**: Sample requests and responses for each endpoint
247
+ - **Schema Validation**: Interactive validation of request parameters
248
+ - **Download Options**: Export the OpenAPI specification
249
+
250
+ ### Example Usage
251
+
252
+ ```bash
253
+ # Start the API server
254
+ litestar --app kreuzberg._api.main:app run
255
+
256
+ # Open your browser to view the documentation
257
+ open http://localhost:8000/schema/swagger
258
+ ```
259
+
260
+ The documentation includes examples for all configuration options, making it easy to understand the full capabilities of the extraction API.
261
+
225
262
  #### Error Handling
226
263
 
227
264
  Invalid configuration returns appropriate error responses:
@@ -258,6 +295,27 @@ Error responses include:
258
295
  }
259
296
  ```
260
297
 
298
+ ### Debugging 500 Errors
299
+
300
+ For detailed error information when 500 Internal Server Errors occur, set the `DEBUG` environment variable:
301
+
302
+ ```bash
303
+ # Enable debug mode for detailed 500 error responses
304
+ DEBUG=1 litestar --app kreuzberg._api.main:app run
305
+
306
+ # Or with uvicorn
307
+ DEBUG=1 uvicorn kreuzberg._api.main:app --host 0.0.0.0 --port 8000
308
+ ```
309
+
310
+ When `DEBUG=1` is set, 500 errors will include:
311
+
312
+ - Full stack traces
313
+ - Detailed error context
314
+ - Internal state information
315
+ - Request debugging details
316
+
317
+ ⚠️ **Warning**: Only enable debug mode in development environments. Debug responses may expose sensitive details, so debug mode should never be enabled in production.
318
+
261
319
  ## Features
262
320
 
263
321
  - **Runtime Configuration**: Configure extraction via query parameters and HTTP headers
@@ -301,6 +359,7 @@ For production use, consider:
301
359
  1. **Monitoring**: Enable OpenTelemetry exporters
302
360
  1. **Rate Limiting**: Add rate limiting middleware
303
361
  1. **Authentication**: Add authentication middleware if needed
362
+ 1. **Security**: Ensure the `DEBUG` environment variable is not set
304
363
 
305
364
  Example production command:
306
365
 
@@ -57,6 +57,13 @@ detector_base_threshold = 0.9
57
57
  remove_null_rows = true
58
58
  enable_multi_header = true
59
59
 
60
+ # DPI and Image Processing configuration
61
+ target_dpi = 150 # Target DPI for document processing
62
+ max_image_dimension = 25000 # Maximum pixel dimension before auto-scaling
63
+ auto_adjust_dpi = true # Automatically adjust DPI for large documents
64
+ min_dpi = 72 # Minimum DPI threshold
65
+ max_dpi = 600 # Maximum DPI threshold
66
+
60
67
  # Language detection configuration
61
68
  [language_detection]
62
69
  multilingual = true
@@ -91,6 +98,13 @@ auto_detect_document_type = true
91
98
  document_classification_mode = "text"
92
99
  type_confidence_threshold = 0.5
93
100
 
101
+ # DPI and Image Processing
102
+ target_dpi = 150
103
+ max_image_dimension = 25000
104
+ auto_adjust_dpi = true
105
+ min_dpi = 72
106
+ max_dpi = 600
107
+
94
108
  [tool.kreuzberg.tesseract]
95
109
  language = "eng"
96
110
  psm = 6
@@ -536,6 +550,67 @@ python -m spacy download fr_core_news_sm # French
536
550
 
537
551
  Available spaCy models include: `en_core_web_sm`, `de_core_news_sm`, `fr_core_news_sm`, `es_core_news_sm`, `pt_core_news_sm`, `it_core_news_sm`, `nl_core_news_sm`, `zh_core_web_sm`, `ja_core_news_sm`, `ko_core_news_sm`, `ru_core_news_sm`, and many others.
538
552
 
553
+ ### DPI and Image Processing
554
+
555
+ Kreuzberg provides intelligent DPI (dots per inch) configuration to optimize document processing quality and performance. This feature automatically handles image scaling for large documents while maintaining OCR quality.
556
+
557
+ ```python
558
+ from kreuzberg import extract_file, ExtractionConfig
559
+
560
+ # Default DPI configuration (optimized for most documents)
561
+ result = await extract_file("large_document.pdf")
562
+
563
+ # Custom DPI configuration for high-quality documents
564
+ config = ExtractionConfig(
565
+ target_dpi=200, # Higher quality for detailed documents
566
+ max_image_dimension=30000, # Allow larger images
567
+ auto_adjust_dpi=True, # Automatically scale down if too large
568
+ min_dpi=100, # Higher minimum for quality
569
+ max_dpi=400, # Lower maximum to control processing time
570
+ )
571
+ result = await extract_file("technical_drawing.pdf", config=config)
572
+
573
+ # Fast processing configuration for large batches
574
+ config = ExtractionConfig(
575
+ target_dpi=120, # Lower DPI for faster processing
576
+ max_image_dimension=15000, # Smaller maximum size
577
+ auto_adjust_dpi=True, # Still allow automatic scaling
578
+ )
579
+ result = await extract_file("large_batch_document.pdf", config=config)
580
+ ```
581
+
582
+ #### DPI Configuration Options
583
+
584
+ - **`target_dpi`** (default: 150): The desired DPI for document processing. Higher values improve quality but slow down processing.
585
+
586
+ - **`max_image_dimension`** (default: 25000): Maximum pixel dimension (width or height) before automatic scaling kicks in.
587
+
588
+ - **`auto_adjust_dpi`** (default: True): Automatically reduce DPI for oversized documents to stay within memory and processing limits.
589
+
590
+ - **`min_dpi`** / **`max_dpi`** (defaults: 72/600): Bounds for automatic DPI adjustment to ensure quality remains within acceptable ranges.
591
+
592
+ #### When to Adjust DPI Settings
593
+
594
+ **Increase DPI for:**
595
+
596
+ - Technical documents with small text or fine details
597
+ - Documents that will undergo further image processing
598
+ - High-quality archival processing
599
+
600
+ **Decrease DPI for:**
601
+
602
+ - Large batch processing where speed is important
603
+ - Documents with simple layouts and large text
604
+ - Memory-constrained environments
605
+
606
+ **Use auto-adjustment for:**
607
+
608
+ - Mixed document types with varying sizes
609
+ - Unknown document dimensions
610
+ - Production environments processing diverse content
611
+
612
+ The DPI system prevents "Image too large" errors while maintaining an optimal balance between quality and performance.
613
+
539
614
  ### Batch Processing
540
615
 
541
616
  ```python