kreuzberg 3.14.1__tar.gz → 3.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (284)
  1. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.github/workflows/docker-e2e-tests.yml +1 -1
  2. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.gitignore +27 -24
  3. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/PKG-INFO +2 -1
  4. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/README.md +1 -0
  5. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/Taskfile.yml +2 -2
  6. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/ai-rulez.yaml +1 -0
  7. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/benchmarks/README.md +30 -0
  8. kreuzberg-3.15.0/benchmarks/batch_size_benchmark.py +179 -0
  9. kreuzberg-3.15.0/benchmarks/batch_validation_benchmark.py +83 -0
  10. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/benchmarks/pyproject.toml +2 -7
  11. kreuzberg-3.15.0/benchmarks/src/__main__.py +4 -0
  12. kreuzberg-3.15.0/benchmarks/src/benchmarks.py +703 -0
  13. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/benchmarks/src/cli.py +215 -182
  14. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/benchmarks/src/models.py +10 -0
  15. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/benchmarks/src/profiler.py +12 -21
  16. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/benchmarks/src/runner.py +52 -63
  17. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/api-reference/types.md +20 -0
  18. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/examples/extraction-examples.md +265 -0
  19. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/index.md +2 -1
  20. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/api-server.md +128 -0
  21. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/extraction-configuration.md +197 -0
  22. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/supported-formats.md +10 -0
  23. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/__init__.py +6 -0
  24. kreuzberg-3.15.0/kreuzberg/_api/_config_cache.py +247 -0
  25. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_api/main.py +127 -45
  26. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_chunker.py +7 -6
  27. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_constants.py +2 -0
  28. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_document_classification.py +4 -6
  29. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_entity_extraction.py +9 -4
  30. kreuzberg-3.15.0/kreuzberg/_extractors/_base.py +328 -0
  31. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_extractors/_email.py +95 -27
  32. kreuzberg-3.15.0/kreuzberg/_extractors/_html.py +121 -0
  33. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_extractors/_image.py +23 -22
  34. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_extractors/_pandoc.py +106 -75
  35. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_extractors/_pdf.py +209 -99
  36. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_extractors/_presentation.py +72 -8
  37. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_extractors/_spread_sheet.py +25 -30
  38. kreuzberg-3.15.0/kreuzberg/_mcp/server.py +514 -0
  39. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_mime_types.py +42 -0
  40. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_ocr/_easyocr.py +2 -2
  41. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_ocr/_paddleocr.py +1 -1
  42. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_ocr/_tesseract.py +74 -34
  43. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_types.py +180 -21
  44. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_cache.py +10 -4
  45. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_device.py +2 -4
  46. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_image_preprocessing.py +12 -39
  47. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_process_pool.py +29 -8
  48. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_quality.py +7 -2
  49. kreuzberg-3.15.0/kreuzberg/_utils/_resource_managers.py +65 -0
  50. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_sync.py +36 -6
  51. kreuzberg-3.15.0/kreuzberg/_utils/_tmp.py +64 -0
  52. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/cli.py +34 -20
  53. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/extraction.py +43 -27
  54. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/pyproject.toml +5 -21
  55. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/api/conftest.py +1 -0
  56. kreuzberg-3.15.0/tests/api/header_config_hashing_test.py +29 -0
  57. kreuzberg-3.15.0/tests/api/image_extraction_test.py +56 -0
  58. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/api/main_test.py +0 -1
  59. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/api/runtime_config_test.py +49 -0
  60. kreuzberg-3.15.0/tests/core/config_test.py +15 -0
  61. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/core/dpi_configuration_test.py +25 -78
  62. kreuzberg-3.15.0/tests/core/extraction_batch_test.py +446 -0
  63. kreuzberg-3.15.0/tests/core/extraction_test.py +457 -0
  64. kreuzberg-3.15.0/tests/core/image_ocr_result_test.py +27 -0
  65. kreuzberg-3.15.0/tests/core/types_test.py +23 -0
  66. kreuzberg-3.15.0/tests/extractors/README_image_tests.md +85 -0
  67. kreuzberg-3.15.0/tests/extractors/base_memory_limits_test.py +100 -0
  68. kreuzberg-3.15.0/tests/extractors/base_ocr_processing_test.py +288 -0
  69. kreuzberg-3.15.0/tests/extractors/base_ocr_simple_test.py +64 -0
  70. kreuzberg-3.15.0/tests/extractors/email_error_paths_test.py +39 -0
  71. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/extractors/email_test.py +34 -10
  72. kreuzberg-3.15.0/tests/extractors/html_invalid_base64_test.py +11 -0
  73. kreuzberg-3.15.0/tests/extractors/image_deduplication_test.py +87 -0
  74. kreuzberg-3.15.0/tests/extractors/image_error_handling_test.py +251 -0
  75. kreuzberg-3.15.0/tests/extractors/image_error_simple_test.py +75 -0
  76. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/extractors/image_test.py +161 -75
  77. kreuzberg-3.15.0/tests/extractors/pdf_images_test.py +52 -0
  78. kreuzberg-3.15.0/tests/extractors/pdf_sync_images_test.py +217 -0
  79. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/extractors/pdf_test.py +26 -11
  80. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/extractors/presentation_test.py +33 -0
  81. kreuzberg-3.14.1/tests/extractors/spreed_sheet_test.py → kreuzberg-3.15.0/tests/extractors/spreadsheet_test.py +39 -28
  82. kreuzberg-3.15.0/tests/features/gmft_test.py +528 -0
  83. kreuzberg-3.15.0/tests/features/language_detection_test.py +415 -0
  84. kreuzberg-3.15.0/tests/integration/all_extractors_images_test.py +231 -0
  85. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/dpi_integration_test.py +9 -44
  86. kreuzberg-3.15.0/tests/integration/pandoc_images_test.py +30 -0
  87. kreuzberg-3.15.0/tests/integration/pdf_images_test.py +18 -0
  88. kreuzberg-3.15.0/tests/integration/pdf_real_images_test.py +52 -0
  89. kreuzberg-3.15.0/tests/integration/pptx_complex_test.py +22 -0
  90. kreuzberg-3.15.0/tests/integration/pptx_images_test.py +18 -0
  91. kreuzberg-3.15.0/tests/interfaces/mcp_server_test.py +1275 -0
  92. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/ocr/paddleocr_test.py +30 -20
  93. kreuzberg-3.15.0/tests/performance/large_pdf_perf_test.py +29 -0
  94. kreuzberg-3.15.0/tests/utils/playa_test.py +264 -0
  95. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/uv.lock +57 -25
  96. kreuzberg-3.14.1/benchmarks/src/__main__.py +0 -4
  97. kreuzberg-3.14.1/benchmarks/src/benchmarks.py +0 -302
  98. kreuzberg-3.14.1/docker-compose.example.yml +0 -26
  99. kreuzberg-3.14.1/kreuzberg/_extractors/_base.py +0 -62
  100. kreuzberg-3.14.1/kreuzberg/_extractors/_html.py +0 -43
  101. kreuzberg-3.14.1/kreuzberg/_mcp/server.py +0 -194
  102. kreuzberg-3.14.1/kreuzberg/_utils/_tmp.py +0 -28
  103. kreuzberg-3.14.1/results/baseline.json +0 -9
  104. kreuzberg-3.14.1/results/serialization.json +0 -11
  105. kreuzberg-3.14.1/results/statistical.json +0 -21
  106. kreuzberg-3.14.1/test_report.json +0 -16
  107. kreuzberg-3.14.1/tests/core/extraction_batch_test.py +0 -0
  108. kreuzberg-3.14.1/tests/core/extraction_test.py +0 -0
  109. kreuzberg-3.14.1/tests/core/types_test.py +0 -0
  110. kreuzberg-3.14.1/tests/features/gmft_test.py +0 -0
  111. kreuzberg-3.14.1/tests/features/language_detection_test.py +0 -0
  112. kreuzberg-3.14.1/tests/utils/playa_test.py +0 -0
  113. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.commitlintrc +0 -0
  114. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.deepsource.toml +0 -0
  115. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.docker/Dockerfile +0 -0
  116. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.docker/README.md +0 -0
  117. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.dockerignore +0 -0
  118. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.github/dependabot.yaml +0 -0
  119. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.github/workflows/ci.yaml +0 -0
  120. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.github/workflows/docs.yml +0 -0
  121. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.github/workflows/pr-title.yaml +0 -0
  122. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.github/workflows/publish-docker.yml +0 -0
  123. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.github/workflows/release.yaml +0 -0
  124. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.github/workflows/test-docker-builds.yml +0 -0
  125. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.markdownlint.yaml +0 -0
  126. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/.pre-commit-config.yaml +0 -0
  127. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/LICENSE +0 -0
  128. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/benchmarks/__init__.py +0 -0
  129. {kreuzberg-3.14.1/kreuzberg → kreuzberg-3.15.0/benchmarks}/py.typed +0 -0
  130. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/benchmarks/src/__init__.py +0 -0
  131. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docker-logs/docker-info.txt +0 -0
  132. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docker-logs/docker-version.txt +0 -0
  133. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/advanced/custom-extractors.md +0 -0
  134. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/advanced/custom-hooks.md +0 -0
  135. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/advanced/error-handling.md +0 -0
  136. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/advanced/index.md +0 -0
  137. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/advanced/performance.md +0 -0
  138. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/api-reference/exceptions.md +0 -0
  139. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/api-reference/extraction-functions.md +0 -0
  140. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/api-reference/extractor-registry.md +0 -0
  141. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/api-reference/index.md +0 -0
  142. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/api-reference/ocr-configuration.md +0 -0
  143. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/assets/favicon.png +0 -0
  144. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/assets/logo.png +0 -0
  145. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/cli.md +0 -0
  146. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/contributing.md +0 -0
  147. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/css/extra.css +0 -0
  148. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/examples/index.md +0 -0
  149. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/getting-started/index.md +0 -0
  150. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/getting-started/installation.md +0 -0
  151. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/getting-started/quick-start.md +0 -0
  152. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/basic-usage.md +0 -0
  153. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/chunking.md +0 -0
  154. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/docker.md +0 -0
  155. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/document-classification.md +0 -0
  156. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/index.md +0 -0
  157. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/mcp-server.md +0 -0
  158. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/metadata-extraction.md +0 -0
  159. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/ocr-backends.md +0 -0
  160. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/docs/user-guide/ocr-configuration.md +0 -0
  161. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/__main__.py +0 -0
  162. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_api/__init__.py +0 -0
  163. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_config.py +0 -0
  164. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_extractors/__init__.py +0 -0
  165. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_extractors/_structured.py +0 -0
  166. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_gmft.py +0 -0
  167. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_language_detection.py +0 -0
  168. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_mcp/__init__.py +0 -0
  169. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_ocr/__init__.py +0 -0
  170. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_ocr/_base.py +0 -0
  171. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_ocr/_table_extractor.py +0 -0
  172. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_playa.py +0 -0
  173. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_registry.py +0 -0
  174. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/__init__.py +0 -0
  175. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_document_cache.py +0 -0
  176. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_errors.py +0 -0
  177. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_ocr_cache.py +0 -0
  178. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  179. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_ref.py +0 -0
  180. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_serialization.py +0 -0
  181. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_string.py +0 -0
  182. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/_utils/_table.py +0 -0
  183. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/kreuzberg/exceptions.py +0 -0
  184. /kreuzberg-3.14.1/output.txt → /kreuzberg-3.15.0/kreuzberg/py.typed +0 -0
  185. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/mkdocs.yaml +0 -0
  186. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/__init__.py +0 -0
  187. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/api/__init__.py +0 -0
  188. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/conftest.py +0 -0
  189. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/core/__init__.py +0 -0
  190. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/core/exceptions_test.py +0 -0
  191. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/core/html_to_markdown_config_test.py +0 -0
  192. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/core/mime_types_test.py +0 -0
  193. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/core/registry_test.py +0 -0
  194. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/e2e/__init__.py +0 -0
  195. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/e2e/docker_e2e_test.py +0 -0
  196. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/extractors/__init__.py +0 -0
  197. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/extractors/html_test.py +0 -0
  198. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  199. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/extractors/pandoc_test.py +0 -0
  200. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/extractors/structured_test.py +0 -0
  201. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/features/__init__.py +0 -0
  202. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/features/chunker_test.py +0 -0
  203. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/features/document_classification_test.py +0 -0
  204. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/features/entity_extraction_test.py +0 -0
  205. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/features/hooks_test.py +0 -0
  206. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/__init__.py +0 -0
  207. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/api/__init__.py +0 -0
  208. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/api/large_file_test.py +0 -0
  209. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/api/mounted_config_test.py +0 -0
  210. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/multiprocessing/__init__.py +0 -0
  211. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
  212. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/ocr/__init__.py +0 -0
  213. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/ocr/device_integration_test.py +0 -0
  214. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
  215. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
  216. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/integration/regression_test.py +0 -0
  217. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/interfaces/__init__.py +0 -0
  218. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/mcp/__init__.py +0 -0
  219. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/mcp/mcp_server_test.py +0 -0
  220. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/multiprocessing/__init__.py +0 -0
  221. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  222. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/multiprocessing/process_manager_test.py +0 -0
  223. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  224. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/ocr/__init__.py +0 -0
  225. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/ocr/base_test.py +0 -0
  226. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/ocr/easyocr_test.py +0 -0
  227. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/ocr/init_test.py +0 -0
  228. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/ocr/tesseract_test.py +0 -0
  229. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/ocr/tesseract_tsv_test.py +0 -0
  230. {kreuzberg-3.14.1/tests/utils → kreuzberg-3.15.0/tests/performance}/__init__.py +0 -0
  231. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
  232. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/contract.txt +0 -0
  233. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/contract_test.txt +0 -0
  234. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/document.docx +0 -0
  235. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/email/sample-email.eml +0 -0
  236. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  237. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/excel.xlsx +0 -0
  238. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/flower-no-text.jpg +0 -0
  239. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/form_test.txt +0 -0
  240. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/french-text.txt +0 -0
  241. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/german-text.txt +0 -0
  242. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/google-doc-document.pdf +0 -0
  243. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/html.html +0 -0
  244. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/images/test_hello_world.png +0 -0
  245. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/invoice_image.png +0 -0
  246. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/invoice_test.txt +0 -0
  247. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/json/sample-document.json +0 -0
  248. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  249. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/markdown.md +0 -0
  250. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  251. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/non-searchable.pdf +0 -0
  252. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/ocr-image.jpg +0 -0
  253. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  254. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  255. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  256. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  257. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/receipt_test.txt +0 -0
  258. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/report_test.txt +0 -0
  259. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/sample-contract.pdf +0 -0
  260. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/scanned.pdf +0 -0
  261. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/searchable.pdf +0 -0
  262. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/sharable-web-guide.pdf +0 -0
  263. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/spanish-text.txt +0 -0
  264. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/tables/borderless_table.png +0 -0
  265. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/tables/complex_document.png +0 -0
  266. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/tables/simple_table.png +0 -0
  267. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/test-article.pdf +0 -0
  268. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/test-excel.xls +0 -0
  269. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  270. /kreuzberg-3.14.1/tests/core/config_test.py → /kreuzberg-3.15.0/tests/utils/__init__.py +0 -0
  271. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/cache_test.py +0 -0
  272. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/device_test.py +0 -0
  273. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/errors_test.py +0 -0
  274. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/ocr_cache_test.py +0 -0
  275. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/pdf_lock_test.py +0 -0
  276. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/playa_helpers_test.py +0 -0
  277. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/process_pool_test.py +0 -0
  278. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/quality_test.py +0 -0
  279. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/ref_test.py +0 -0
  280. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/serialization_test.py +0 -0
  281. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/string_test.py +0 -0
  282. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/sync_test.py +0 -0
  283. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/table_test.py +0 -0
  284. {kreuzberg-3.14.1 → kreuzberg-3.15.0}/tests/utils/tmp_test.py +0 -0
@@ -99,7 +99,7 @@ jobs:
99
99
  run: |
100
100
  mkdir -p tests/e2e/logs
101
101
  echo "Running E2E tests for ${{ matrix.image.name }}..."
102
- python3 tests/e2e/docker_e2e_test.py --image ${{ matrix.image.name }}
102
+ python3 tests/e2e/docker_e2e.py --image ${{ matrix.image.name }}
103
103
 
104
104
  - name: Generate test report - ${{ matrix.image.name }}
105
105
  if: always()
@@ -1,18 +1,20 @@
1
1
  *$py.class
2
2
  *.Cache
3
- .clause/
4
3
  *.cscfg
5
4
  *.egg-info/
6
5
  *.log
7
6
  *.py[cod]
8
7
  *.suo
8
+ *.tar.gz
9
+ *.temp
10
+ *.tmp
9
11
  *.user
12
+ *.whl
10
13
  *temp/
14
+ .cache/
15
+ .claude/
11
16
  .coverage
12
17
  .coverage*
13
- coverage.lcov
14
- htmlcov/
15
- .claude/
16
18
  .cursorrules
17
19
  .dist/
18
20
  .DS_store
@@ -20,47 +22,47 @@ htmlcov/
20
22
  .idea/
21
23
  .kreuzberg/
22
24
  .mypy_cache/
25
+ .nox/
23
26
  .pytest_cache/
24
27
  .python-version
25
28
  .ropeproject
26
29
  .ruff_cache/
27
30
  .run/
31
+ .task/
32
+ .tmp/
33
+ .tox/
28
34
  .venv/
29
35
  .vscode/
30
36
  .windsurfrules
31
37
  __pycache__/
38
+ AGENTS.md
32
39
  benchmark_results.json
40
+ benchmarks/results/
41
+ build/
33
42
  CLAUDE.md
43
+ coverage.lcov
34
44
  coverage.xml
45
+ dist/
35
46
  docker-compose.yaml
47
+ docs/_build/
48
+ docs/build/
36
49
  GEMINI.md
50
+ htmlcov/
51
+ node_modules/
52
+ npm-debug.log*
53
+ output.txt
37
54
  prompt_template.egg-info/
38
55
  requirements.txt
56
+ share/python-wheels/
39
57
  site/
40
- .cache/
41
- dist/
42
- build/
43
- .task/
44
- tests/e2e/test_report.json
58
+ test_report.json
45
59
  tests/e2e/logs/
46
-
47
- # Additional build artifacts
48
- *.whl
49
- *.tar.gz
50
- .tox/
51
- .nox/
60
+ tests/e2e/test_report.json
52
61
  wheels/
53
- share/python-wheels/
54
-
55
- # Documentation builds
56
- docs/_build/
57
- docs/build/
58
-
59
- # Node.js (if any frontend tools are used)
60
- node_modules/
61
- npm-debug.log*
62
62
  yarn-debug.log*
63
63
  yarn-error.log*
64
+ todo.md
65
+ TODO.md
64
66
 
65
67
  # Temporary files
66
68
  *.tmp
@@ -69,3 +71,4 @@ yarn-error.log*
69
71
 
70
72
  # AI Rules generated files
71
73
  .claude/agents/
74
+ AGENTS.md
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.14.1
3
+ Version: 3.15.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -107,6 +107,7 @@ Description-Content-Type: text/markdown
107
107
  ### Document Intelligence Capabilities
108
108
 
109
109
  - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
110
+ - **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
110
111
  - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
111
112
  - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
112
113
  - **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
@@ -16,6 +16,7 @@
16
16
  ### Document Intelligence Capabilities
17
17
 
18
18
  - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
19
+ - **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
19
20
  - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
20
21
  - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
21
22
  - **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
@@ -16,7 +16,7 @@ tasks:
16
16
  deps:
17
17
  - docker:build
18
18
  cmds:
19
- - uv run python {{.TEST_DIR}}/docker_e2e_test.py
19
+ - uv run python {{.TEST_DIR}}/docker_e2e.py
20
20
 
21
21
  docker:build:
22
22
  desc: "Build all Docker images for testing"
@@ -67,7 +67,7 @@ tasks:
67
67
  docker:test:
68
68
  desc: "Run Docker E2E tests (images must be built)"
69
69
  cmds:
70
- - uv run python {{.TEST_DIR}}/docker_e2e_test.py
70
+ - uv run python {{.TEST_DIR}}/docker_e2e.py
71
71
 
72
72
  docker:clean:
73
73
  desc: "Clean up Docker test images and containers"
@@ -385,6 +385,7 @@ rules:
385
385
  - NEVER proactively create documentation files (*.md) or README files
386
386
  - Only create documentation files if explicitly requested by the User
387
387
  - All builtin imports should be at the top level (except for cyclical or optional dependencies)
388
+ - All config dataclasses must be hashable, frozen, and use slots: `@dataclass(unsafe_hash=True, frozen=True, slots=True)`
388
389
  - When committing, always use the format specified in the CLAUDE.md
389
390
  name: Important Instructions
390
391
  priority: critical
@@ -87,6 +87,18 @@ uv run python -m benchmarks.src run --stress
87
87
  # Run backend comparison benchmarks
88
88
  uv run python -m benchmarks.src run --backend-comparison
89
89
 
90
+ # Include Tesseract OCR benchmarks (sync)
91
+ uv run python -m benchmarks.src run --tesseract
92
+
93
+ # Include expanded Tesseract variant matrix (formats/PSM)
94
+ uv run python -m benchmarks.src run --tesseract --tesseract-matrix
95
+
96
+ # Compare Tesseract architectures (threads vs processes)
97
+ uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-arch
98
+
99
+ # Compare with custom worker counts (e.g., 1,4,8)
100
+ uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-arch --workers 1,4,8
101
+
90
102
  # Custom test files directory
91
103
  uv run python -m benchmarks.src run --test-files-dir /path/to/test/files
92
104
 
@@ -232,3 +244,21 @@ uv run python -m benchmarks.src run --sync-only --suite-name main_baseline
232
244
  uv run python -m benchmarks.src run --sync-only --suite-name pr_test
233
245
  uv run python -m benchmarks.src compare results/main_baseline.json results/pr_test.json
234
246
  ```
247
+
248
+ ### Tesseract Benchmarks
249
+
250
+ The suite includes focused Tesseract OCR benchmarks:
251
+
252
+ - `--tesseract` adds thread-based batch OCR and a process-pool placeholder for A/B comparisons.
253
+ - `--tesseract-matrix` expands with a small matrix across output formats (`text`, `markdown`, `tsv`) and PSM modes
254
+ (`AUTO`, `SINGLE_BLOCK`, `SINGLE_LINE`) to quantify overhead of richer outputs and segmentation strategies.
255
+
256
+ Examples:
257
+
258
+ ```bash
259
+ # Minimal Tesseract batch OCR benchmarks
260
+ uv run python -m benchmarks.src run --sync-only --tesseract
261
+
262
+ # Full Tesseract config matrix
263
+ uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-matrix
264
+ ```
@@ -0,0 +1,179 @@
1
+ import json
2
+ import shutil
3
+ import tempfile
4
+ import time
5
+ from concurrent.futures import ProcessPoolExecutor
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from PIL import Image, ImageDraw
10
+
11
+ from kreuzberg import extract_file_sync
12
+ from kreuzberg._ocr._tesseract import _process_image_with_tesseract
13
+ from kreuzberg._types import ExtractionConfig
14
+ from kreuzberg._utils._process_pool import get_optimal_worker_count, process_pool
15
+
16
+
17
def create_test_images(sizes: list[tuple[int, int]], output_dir: Path) -> list[Path]:
    """Render one synthetic PNG per requested size and return the saved paths.

    Every image is a white RGB canvas with the text ``Test <index>`` drawn in
    black at each point of a grid that steps 100 pixels horizontally and
    50 pixels vertically.

    Args:
        sizes: ``(width, height)`` pairs; one image is written per entry, and
            the entry's position is embedded in both the text and the filename.
        output_dir: Directory that receives the images. It is created,
            including any missing parent directories, when it does not exist.

    Returns:
        The paths of the written images, in the same order as ``sizes``.
    """
    # parents=True: a nested target whose parent is missing must not raise.
    output_dir.mkdir(parents=True, exist_ok=True)
    image_paths: list[Path] = []

    for i, (width, height) in enumerate(sizes):
        img = Image.new("RGB", (width, height), color="white")
        draw = ImageDraw.Draw(img)

        for y in range(0, height, 50):
            for x in range(0, width, 100):
                draw.text((x, y), f"Test {i}", fill="black")

        # The index keeps filenames unique when the same size is requested repeatedly.
        path = output_dir / f"test_{width}x{height}_{i}.png"
        img.save(path)
        image_paths.append(path)

    return image_paths
34
+
35
+
36
def benchmark_batch_fixed_workers(images: list[Path], num_workers: int) -> dict[str, Any]:
    """Time OCR of ``images`` on a fresh process pool of ``num_workers`` workers.

    The measured interval covers pool creation, task submission, waiting for
    every task, and pool shutdown.

    Args:
        images: Image files to process; each path is passed to the worker as a string.
        num_workers: ``max_workers`` for the ``ProcessPoolExecutor``.

    Returns:
        A dict with the keys ``strategy`` (``"fixed"``), ``workers``,
        ``batch_size``, ``duration`` (seconds) and ``per_image`` (seconds per
        image, ``0`` for an empty batch).
    """
    started_at = time.perf_counter()
    tesseract_options = {"language": "eng", "psm": 3}

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        pending = [
            executor.submit(_process_image_with_tesseract, str(image), tesseract_options)
            for image in images
        ]
        # Wait for every task; the OCR output is discarded, only the timing matters.
        for task in pending:
            task.result()

    elapsed = time.perf_counter() - started_at
    image_count = len(images)
    return {
        "strategy": "fixed",
        "workers": num_workers,
        "batch_size": image_count,
        "duration": elapsed,
        "per_image": elapsed / image_count if image_count else 0,
    }
52
+
53
+
54
def benchmark_batch_dynamic_workers(images: list[Path]) -> dict[str, Any]:
    """Time OCR of ``images`` on a fresh process pool sized for the batch.

    The worker count comes from ``get_optimal_worker_count(len(images),
    cpu_intensive=True)``. That lookup is inside the measured interval, along
    with pool creation, submission, waiting and shutdown.

    Args:
        images: Image files to process; each path is passed to the worker as a string.

    Returns:
        A dict with the keys ``strategy`` (``"dynamic"``), ``workers`` (the
        computed worker count), ``batch_size``, ``duration`` (seconds) and
        ``per_image`` (seconds per image, ``0`` for an empty batch).
    """
    started_at = time.perf_counter()
    tesseract_options = {"language": "eng", "psm": 3}
    image_count = len(images)

    worker_count = get_optimal_worker_count(image_count, cpu_intensive=True)

    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        pending = [
            executor.submit(_process_image_with_tesseract, str(image), tesseract_options)
            for image in images
        ]
        # Wait for every task; the OCR output is discarded, only the timing matters.
        for task in pending:
            task.result()

    elapsed = time.perf_counter() - started_at
    return {
        "strategy": "dynamic",
        "workers": worker_count,
        "batch_size": image_count,
        "duration": elapsed,
        "per_image": elapsed / image_count if image_count else 0,
    }
72
+
73
+
74
def benchmark_batch_shared_pool(images: list[Path]) -> dict[str, Any]:
    """Time OCR of ``images`` on the pool yielded by ``process_pool()``.

    Args:
        images: Image files to process; each path is passed to the worker as a string.

    Returns:
        A dict with the keys ``strategy`` (``"shared_pool"``), ``workers``,
        ``batch_size``, ``duration`` (seconds) and ``per_image`` (seconds per
        image, ``0`` for an empty batch).
    """
    started_at = time.perf_counter()
    tesseract_options = {"language": "eng", "psm": 3}

    with process_pool() as shared:
        pending = [
            shared.submit(_process_image_with_tesseract, str(image), tesseract_options)
            for image in images
        ]
        # Wait for every task; the OCR output is discarded, only the timing matters.
        for task in pending:
            task.result()

    elapsed = time.perf_counter() - started_at
    image_count = len(images)
    return {
        "strategy": "shared_pool",
        # NOTE(review): 14 is a literal, not read from the pool -- confirm it
        # matches the real size of the shared pool on the benchmark machine.
        "workers": 14,
        "batch_size": image_count,
        "duration": elapsed,
        "per_image": elapsed / image_count if image_count else 0,
    }
90
+
91
+
92
def benchmark_extraction_api(images: list[Path]) -> dict[str, Any]:
    """Time ``extract_file_sync`` called once per image, one after another.

    Extraction runs with ``use_cache=False`` and ``force_ocr=True``. Building
    the config object is part of the measured interval.

    Args:
        images: Image files to extract.

    Returns:
        A dict with the keys ``strategy`` (``"extraction_api"``), ``workers``
        (the string ``"auto"``), ``batch_size``, ``duration`` (seconds) and
        ``per_image`` (seconds per image, ``0`` for an empty batch).
    """
    started_at = time.perf_counter()

    extraction_config = ExtractionConfig(use_cache=False, force_ocr=True)
    for image in images:
        extract_file_sync(image, config=extraction_config)

    elapsed = time.perf_counter() - started_at
    image_count = len(images)
    return {
        "strategy": "extraction_api",
        "workers": "auto",
        "batch_size": image_count,
        "duration": elapsed,
        "per_image": elapsed / image_count if image_count else 0,
    }
108
+
109
+
110
def main() -> None:
    """Benchmark every batching strategy on synthetic images and save the timings.

    For each image size, the largest batch of test images is generated once in
    a temporary directory, and each batch size is benchmarked on a prefix of
    that list. The fixed-worker run is the baseline: every other strategy gets
    an ``improvement_pct`` entry relative to it. The extraction-API strategy
    only runs for batches of at most 10 images.

    Results are written to ``results/batch_size_benchmarks.json`` relative to
    the current working directory. The temporary image directory is removed
    even when a benchmark raises.
    """
    batch_sizes = [1, 2, 5, 10, 20]
    image_sizes = [
        (640, 480),
        (1024, 768),
        (1920, 1080),
    ]
    max_batch = max(batch_sizes)

    test_dir = Path(tempfile.mkdtemp(prefix="kreuzberg_bench_"))
    results: list[dict[str, Any]] = []

    try:
        for img_width, img_height in image_sizes:
            images = create_test_images([(img_width, img_height)] * max_batch, test_dir)

            for batch_size in batch_sizes:
                batch = images[:batch_size]

                fixed_result = benchmark_batch_fixed_workers(batch, 14)
                strategies = [
                    fixed_result,
                    benchmark_batch_dynamic_workers(batch),
                    benchmark_batch_shared_pool(batch),
                ]
                if batch_size <= 10:
                    strategies.append(benchmark_extraction_api(batch))

                # Positive improvement_pct means faster than the fixed-worker baseline.
                baseline = fixed_result["duration"]
                if baseline > 0:
                    for strategy in strategies[1:]:
                        strategy["improvement_pct"] = ((baseline - strategy["duration"]) / baseline) * 100

                results.append(
                    {
                        "image_size": f"{img_width}x{img_height}",
                        "batch_size": batch_size,
                        "strategies": strategies,
                    }
                )
    finally:
        # ignore_errors so a cleanup failure never masks a benchmark exception.
        shutil.rmtree(test_dir, ignore_errors=True)

    output_file = Path("results/batch_size_benchmarks.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with output_file.open("w") as f:
        json.dump({"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "results": results}, f, indent=2)


if __name__ == "__main__":
    main()
@@ -0,0 +1,83 @@
1
+ import json
2
+ import time
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from kreuzberg import extract_file_sync
7
+ from kreuzberg._types import ExtractionConfig
8
+
9
+
10
def benchmark_real_world_scenario(file_paths: list[Path], scenario_name: str) -> dict[str, Any]:
    """Time sequential extraction of ``file_paths`` with caching disabled.

    Args:
        file_paths: Files to extract, one ``extract_file_sync`` call each.
        scenario_name: Label copied into the result under ``scenario``.

    Returns:
        A dict with the keys ``scenario``, ``file_count``, ``duration``
        (seconds), ``per_file`` (seconds per file) and ``total_chars`` (sum of
        the lengths of the extracted contents). For an empty ``file_paths``
        nothing is extracted and every numeric field is zero, instead of
        dividing by zero.
    """
    file_count = len(file_paths)
    char_counts: list[int] = []
    duration = 0.0

    if file_paths:
        # Config creation stays outside the timed section.
        config = ExtractionConfig(use_cache=False)

        start = time.perf_counter()
        for path in file_paths:
            result = extract_file_sync(path, config=config)
            char_counts.append(len(result.content))
        duration = time.perf_counter() - start

    return {
        "scenario": scenario_name,
        "file_count": file_count,
        "duration": duration,
        "per_file": duration / file_count if file_count else 0,
        "total_chars": sum(char_counts),
    }
28
+
29
+
30
def main() -> None:
    """Run the real-world extraction scenarios and write a JSON summary.

    Test documents are read from ``tests/test_source_files`` in the directory
    above this file's directory, so the script is not tied to one developer's
    checkout path. Each scenario takes the first few files (in sorted order)
    matching each of its glob patterns; scenarios with no matching files are
    skipped. The summary is written to ``results/final_batch_validation.json``
    relative to the current working directory.
    """
    test_dir = Path(__file__).resolve().parent.parent / "tests" / "test_source_files"

    # (scenario name, glob patterns, maximum files taken per pattern)
    scenario_specs: list[tuple[str, list[str], int]] = [
        ("Mixed Office Documents", ["*.pdf", "*.docx", "*.xlsx", "*.pptx"], 2),
        ("Image Batch Processing", ["*.png", "*.jpg", "*.jpeg"], 3),
        ("PDF Document Processing", ["*.pdf"], 5),
        ("Small Text Files", ["*.txt", "*.md", "*.html"], 3),
    ]

    scenarios = []
    for scenario_name, patterns, per_pattern_limit in scenario_specs:
        selected: list[Path] = []
        for pattern in patterns:
            # glob order is arbitrary; sort so every run picks the same files.
            selected.extend(sorted(test_dir.glob(pattern))[:per_pattern_limit])
        if selected:
            scenarios.append(benchmark_real_world_scenario(selected, scenario_name))

    total_files = sum(s["file_count"] for s in scenarios)
    total_time = sum(s["duration"] for s in scenarios)
    total_chars = sum(s["total_chars"] for s in scenarios)

    output = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "scenarios": scenarios,
        "summary": {
            "total_files": total_files,
            "total_time": total_time,
            "avg_per_file": total_time / total_files if total_files > 0 else 0,
            "total_chars": total_chars,
            "throughput": total_chars / total_time if total_time > 0 else 0,
        },
    }

    output_file = Path("results/final_batch_validation.json")
    # The results directory may not exist on a fresh checkout.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open("w") as f:
        json.dump(output, f, indent=2)


if __name__ == "__main__":
    main()
@@ -17,18 +17,13 @@ classifiers = [
17
17
  # kreuzberg-bench = "src.cli:app"
18
18
 
19
19
  dependencies = [
20
+ "click>=8.2.1",
20
21
  "kreuzberg",
21
- "matplotlib>=3.7",
22
- "memory-profiler>=0.61",
23
- "pandas>=2",
22
+ "msgpack>=1.1.1",
24
23
  "psutil>=5.9",
25
24
  "py-spy>=0.3.14",
26
25
  "rich>=13",
27
- "typer>=0.9",
28
26
  ]
29
27
 
30
- [tool.ruff]
31
- lint.extend-ignore = [ "ARG002", "B008", "B904", "BLE001", "E722", "PLR2004", "PYI036", "SLF001" ]
32
-
33
28
  [tool.uv.sources]
34
29
  kreuzberg = { workspace = true }
@@ -0,0 +1,4 @@
1
+ from .cli import cli
2
+
3
# Run the CLI when this package is executed as a script (e.g. `python -m benchmarks.src`).
+ if __name__ == "__main__":
4
+ cli()