kreuzberg 3.3.0__tar.gz → 3.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. kreuzberg-3.4.0/.docker/Dockerfile +21 -0
  2. kreuzberg-3.4.0/.docker/README.md +87 -0
  3. kreuzberg-3.4.0/.dockerignore +15 -0
  4. kreuzberg-3.4.0/.github/workflows/publish-docker.yml +101 -0
  5. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.gitignore +9 -8
  6. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/PKG-INFO +63 -8
  7. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/README.md +59 -7
  8. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/advanced/performance.md +55 -10
  9. kreuzberg-3.4.0/docs/user-guide/api-server.md +169 -0
  10. kreuzberg-3.4.0/docs/user-guide/docker.md +249 -0
  11. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/index.md +2 -0
  12. kreuzberg-3.4.0/kreuzberg/_api/main.py +87 -0
  13. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_types.py +4 -0
  14. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/mkdocs.yaml +2 -0
  15. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/pyproject.toml +18 -16
  16. kreuzberg-3.4.0/tests/api/main_test.py +252 -0
  17. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/cli_integration_test.py +2 -2
  18. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/gmft_test.py +18 -14
  19. kreuzberg-3.4.0/tests/ocr/__init__.py +0 -0
  20. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/tesseract_test.py +27 -25
  21. kreuzberg-3.4.0/tests/utils/__init__.py +0 -0
  22. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/uv.lock +441 -32
  23. kreuzberg-3.3.0/run_benchmarks.py +0 -195
  24. kreuzberg-3.3.0/scripts/__init__.py +0 -1
  25. kreuzberg-3.3.0/scripts/compare_benchmarks.py +0 -100
  26. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.commitlintrc +0 -0
  27. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.github/benchmarks/README.md +0 -0
  28. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.github/dependabot.yaml +0 -0
  29. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.github/workflows/ci.yaml +0 -0
  30. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.github/workflows/pr-title.yaml +0 -0
  31. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.github/workflows/release.yaml +0 -0
  32. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.markdownlint.yaml +0 -0
  33. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.pre-commit-config.yaml +0 -0
  34. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/LICENSE +0 -0
  35. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/ai-rulez.yaml +0 -0
  36. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/README.md +0 -0
  37. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/benchmark_baseline.py +0 -0
  38. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/end_to_end_benchmark.py +0 -0
  39. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/final_benchmark.py +0 -0
  40. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/pyproject.toml +0 -0
  41. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/baseline_results.json +0 -0
  42. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  43. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
  44. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/final_benchmark_results.json +0 -0
  45. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/mime_caching_results.json +0 -0
  46. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/msgspec_caching_results.json +0 -0
  47. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/ocr_caching_results.json +0 -0
  48. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
  49. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
  50. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/table_caching_results.json +0 -0
  51. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/serialization_benchmark.py +0 -0
  52. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  53. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  54. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  55. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  56. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  57. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  58. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  59. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/statistical_benchmark.py +0 -0
  60. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/advanced/custom-extractors.md +0 -0
  61. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/advanced/custom-hooks.md +0 -0
  62. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/advanced/error-handling.md +0 -0
  63. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/advanced/index.md +0 -0
  64. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/exceptions.md +0 -0
  65. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/extraction-functions.md +0 -0
  66. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/extractor-registry.md +0 -0
  67. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/index.md +0 -0
  68. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/ocr-configuration.md +0 -0
  69. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/types.md +0 -0
  70. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/assets/favicon.png +0 -0
  71. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/assets/logo.png +0 -0
  72. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/changelog.md +0 -0
  73. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/cli.md +0 -0
  74. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/contributing.md +0 -0
  75. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/css/extra.css +0 -0
  76. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/examples/extraction-examples.md +0 -0
  77. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/examples/index.md +0 -0
  78. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/getting-started/index.md +0 -0
  79. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/getting-started/installation.md +0 -0
  80. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/getting-started/quick-start.md +0 -0
  81. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/index.md +0 -0
  82. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/basic-usage.md +0 -0
  83. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/chunking.md +0 -0
  84. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/extraction-configuration.md +0 -0
  85. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/metadata-extraction.md +0 -0
  86. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/ocr-backends.md +0 -0
  87. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/ocr-configuration.md +0 -0
  88. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/supported-formats.md +0 -0
  89. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/__init__.py +0 -0
  90. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/__main__.py +0 -0
  91. {kreuzberg-3.3.0/kreuzberg/_extractors → kreuzberg-3.4.0/kreuzberg/_api}/__init__.py +0 -0
  92. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_chunker.py +0 -0
  93. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_cli_config.py +0 -0
  94. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_constants.py +0 -0
  95. {kreuzberg-3.3.0/kreuzberg/_utils → kreuzberg-3.4.0/kreuzberg/_extractors}/__init__.py +0 -0
  96. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_base.py +0 -0
  97. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_html.py +0 -0
  98. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_image.py +0 -0
  99. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_pandoc.py +0 -0
  100. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_pdf.py +0 -0
  101. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_presentation.py +0 -0
  102. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  103. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_gmft.py +0 -0
  104. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_mime_types.py +0 -0
  105. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_multiprocessing/__init__.py +0 -0
  106. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
  107. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_multiprocessing/process_manager.py +0 -0
  108. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
  109. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
  110. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/__init__.py +0 -0
  111. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_base.py +0 -0
  112. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_easyocr.py +0 -0
  113. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
  114. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_tesseract.py +0 -0
  115. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_playa.py +0 -0
  116. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_registry.py +0 -0
  117. {kreuzberg-3.3.0/tests → kreuzberg-3.4.0/kreuzberg/_utils}/__init__.py +0 -0
  118. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_cache.py +0 -0
  119. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_device.py +0 -0
  120. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_document_cache.py +0 -0
  121. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_errors.py +0 -0
  122. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  123. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_process_pool.py +0 -0
  124. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_serialization.py +0 -0
  125. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_string.py +0 -0
  126. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_sync.py +0 -0
  127. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_tmp.py +0 -0
  128. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/cli.py +0 -0
  129. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/exceptions.py +0 -0
  130. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/extraction.py +0 -0
  131. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/py.typed +0 -0
  132. {kreuzberg-3.3.0/tests/extractors → kreuzberg-3.4.0/tests}/__init__.py +0 -0
  133. {kreuzberg-3.3.0/tests/ocr → kreuzberg-3.4.0/tests/api}/__init__.py +0 -0
  134. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/chunker_test.py +0 -0
  135. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/cli_test.py +0 -0
  136. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/conftest.py +0 -0
  137. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/exceptions_test.py +0 -0
  138. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extraction_batch_test.py +0 -0
  139. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extraction_test.py +0 -0
  140. {kreuzberg-3.3.0/tests/utils → kreuzberg-3.4.0/tests/extractors}/__init__.py +0 -0
  141. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/html_test.py +0 -0
  142. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/image_test.py +0 -0
  143. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  144. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/pandoc_test.py +0 -0
  145. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/pdf_test.py +0 -0
  146. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/presentation_test.py +0 -0
  147. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/spreed_sheet_test.py +0 -0
  148. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/gmft_extended_test.py +0 -0
  149. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/hooks_test.py +0 -0
  150. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/mime_types_test.py +0 -0
  151. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/multiprocessing/__init__.py +0 -0
  152. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
  153. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/multiprocessing/process_manager_test.py +0 -0
  154. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/multiprocessing/sync_tesseract_test.py +0 -0
  155. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  156. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/base_test.py +0 -0
  157. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/device_integration_test.py +0 -0
  158. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/easyocr_test.py +0 -0
  159. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/init_test.py +0 -0
  160. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/paddleocr_test.py +0 -0
  161. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/playa_test.py +0 -0
  162. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/registry_test.py +0 -0
  163. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/document.docx +0 -0
  164. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  165. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/excel.xlsx +0 -0
  166. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/html.html +0 -0
  167. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/markdown.md +0 -0
  168. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  169. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/non-searchable.pdf +0 -0
  170. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/ocr-image.jpg +0 -0
  171. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  172. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  173. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  174. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  175. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/sample-contract.pdf +0 -0
  176. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/scanned.pdf +0 -0
  177. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/searchable.pdf +0 -0
  178. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/test-article.pdf +0 -0
  179. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/types_test.py +0 -0
  180. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/cache_test.py +0 -0
  181. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/device_test.py +0 -0
  182. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/errors_test.py +0 -0
  183. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/pdf_lock_test.py +0 -0
  184. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/process_pool_test.py +0 -0
  185. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/serialization_test.py +0 -0
  186. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/string_test.py +0 -0
  187. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/sync_test.py +0 -0
  188. {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/tmp_test.py +0 -0
@@ -0,0 +1,21 @@
1
+ FROM ghcr.io/astral-sh/uv:python3.13-bookworm as app
2
+ ARG EXTRAS=""
3
+ WORKDIR /app
4
+ ENV PYTHONDONTWRITEBYTECODE 1
5
+ ENV PYTHONUNBUFFERED 1
6
+ ENV UV_LINK_MODE=copy
7
+
8
+ RUN apt-get update && apt-get install -y --no-install-recommends \
9
+ pandoc \
10
+ tesseract-ocr \
11
+ && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
12
+
13
+
14
+ COPY pyproject.toml uv.lock README.md ./
15
+ COPY kreuzberg kreuzberg
16
+
17
+ RUN uv sync --extra api${EXTRAS:+ --extra ${EXTRAS}} --no-editable --no-dev --compile-bytecode
18
+
19
+ RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser
20
+ USER appuser
21
+ CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
@@ -0,0 +1,87 @@
1
+ # Kreuzberg Docker Images
2
+
3
+ [![GitHub](https://img.shields.io/badge/GitHub-Goldziher%2Fkreuzberg-blue)](https://github.com/Goldziher/kreuzberg)
4
+ [![PyPI](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
5
+ [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE)
7
+
8
+ High-performance Python library for text extraction from documents, available as Docker images.
9
+
10
+ **Source Code**: [github.com/Goldziher/kreuzberg](https://github.com/Goldziher/kreuzberg)
11
+
12
+ ## Quick Start
13
+
14
+ ```bash
15
+ docker run -p 8000:8000 goldziher/kreuzberg:latest
16
+ ```
17
+
18
+ ## Available Tags
19
+
20
+ - `latest` - Latest stable release with API server and Tesseract OCR
21
+ - `X.Y.Z` - Specific version (e.g., `3.0.0`)
22
+ - `X.Y.Z-easyocr` - With EasyOCR support
23
+ - `X.Y.Z-paddle` - With PaddleOCR support
24
+ - `X.Y.Z-gmft` - With GMFT table extraction
25
+ - `X.Y.Z-all` - With all optional dependencies
26
+
27
+ ## Usage
28
+
29
+ ### Extract Files via API
30
+
31
+ ```bash
32
+ # Single file
33
+ curl -X POST http://localhost:8000/extract \
34
+ -F "data=@document.pdf"
35
+
36
+ # Multiple files
37
+ curl -X POST http://localhost:8000/extract \
38
+ -F "data=@document1.pdf" \
39
+ -F "data=@document2.docx"
40
+ ```
41
+
42
+ ### Docker Compose
43
+
44
+ ```yaml
45
+ version: '3.8'
46
+
47
+ services:
48
+ kreuzberg:
49
+ image: goldziher/kreuzberg:latest
50
+ ports:
51
+ - "8000:8000"
52
+ restart: unless-stopped
53
+ ```
54
+
55
+ ## Features
56
+
57
+ - **🚀 High Performance**: Optimized for speed and efficiency
58
+ - **📄 Multiple Formats**: PDF, DOCX, images, HTML, and more
59
+ - **🔍 OCR Support**: Built-in Tesseract, optional EasyOCR/PaddleOCR
60
+ - **📊 Table Extraction**: Extract tables with GMFT
61
+ - **🔒 Secure**: Runs as non-root user, no external API calls
62
+ - **📦 Ready to Use**: Pre-configured API server
63
+
64
+ ## Documentation
65
+
66
+ - **[GitHub Repository](https://github.com/Goldziher/kreuzberg)** - Source code and issue tracking
67
+ - **[Full Documentation](https://goldziher.github.io/kreuzberg/)** - Complete user guide and API reference
68
+ - **[API Documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/)** - REST API endpoints and usage
69
+ - **[Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/)** - Detailed Docker usage guide
70
+
71
+ ## Support
72
+
73
+ - **Issues**: [github.com/Goldziher/kreuzberg/issues](https://github.com/Goldziher/kreuzberg/issues)
74
+ - **Discussions**: [github.com/Goldziher/kreuzberg/discussions](https://github.com/Goldziher/kreuzberg/discussions)
75
+ - **Discord**: [Join our community](https://discord.gg/pXxagNK2zN)
76
+
77
+ ## Contributing
78
+
79
+ Contributions are welcome! See our [Contributing Guide](https://github.com/Goldziher/kreuzberg/blob/main/docs/contributing.md).
80
+
81
+ ## License
82
+
83
+ MIT License - see [LICENSE](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE) for details.
84
+
85
+ ______________________________________________________________________
86
+
87
+ Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
@@ -0,0 +1,15 @@
1
+ *.pyc
2
+ *.pyd
3
+ *.pyo
4
+ .git
5
+ .github/
6
+ .gitignore
7
+ .idea
8
+ .mypy_cache
9
+ .pytest_cache
10
+ .ruff_cache
11
+ .vscode
12
+ __pycache__
13
+ benchmarks/
14
+ docs/
15
+ tests/
@@ -0,0 +1,101 @@
1
+ # .github/workflows/publish-docker.yml
2
+
3
+ name: Publish Docker Images
4
+
5
+ on:
6
+ workflow_run:
7
+ workflows: ["Release"]
8
+ types:
9
+ - completed
10
+ branches:
11
+ - main
12
+
13
+ jobs:
14
+ build-and-push:
15
+ runs-on: ubuntu-latest
16
+ if: ${{ github.event.workflow_run.conclusion == 'success' }}
17
+ permissions:
18
+ contents: read
19
+ packages: write
20
+
21
+ strategy:
22
+ matrix:
23
+ include:
24
+ - name: core
25
+ extras: ""
26
+ tag_suffix: "" # The base image tag (includes API + tesseract)
27
+ - name: easyocr
28
+ extras: "easyocr"
29
+ tag_suffix: "-easyocr"
30
+ - name: paddle
31
+ extras: "paddleocr"
32
+ tag_suffix: "-paddle"
33
+ - name: gmft
34
+ extras: "gmft"
35
+ tag_suffix: "-gmft"
36
+ - name: all
37
+ extras: "all"
38
+ tag_suffix: "-all"
39
+
40
+ steps:
41
+ - name: Checkout repository
42
+ uses: actions/checkout@v4
43
+ with:
44
+ ref: ${{ github.event.workflow_run.head_branch }}
45
+
46
+ - name: Get release version
47
+ id: get_version
48
+ run: |
49
+ echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
50
+ # If triggered by a tag, extract version
51
+ if [[ "${{ github.event.workflow_run.head_branch }}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
52
+ echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
53
+ else
54
+ # Get the latest tag
55
+ git fetch --tags
56
+ echo "VERSION=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
57
+ fi
58
+
59
+ - name: Set up QEMU
60
+ uses: docker/setup-qemu-action@v3
61
+
62
+ - name: Set up Docker Buildx
63
+ uses: docker/setup-buildx-action@v3
64
+
65
+ - name: Log in to Docker Hub
66
+ uses: docker/login-action@v3
67
+ with:
68
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
69
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
70
+
71
+ - name: Extract metadata (tags, labels) for Docker
72
+ id: meta
73
+ uses: docker/metadata-action@v5
74
+ with:
75
+ images: goldziher/kreuzberg
76
+ tags: |
77
+ # Release version tag (e.g., v3.0.0-easyocr)
78
+ type=raw,value=${{ steps.get_version.outputs.VERSION }}${{ matrix.tag_suffix }}
79
+ # Latest tag for each variant (e.g., latest-easyocr)
80
+ type=raw,value=latest${{ matrix.tag_suffix }}
81
+
82
+ - name: Build and push Docker image
83
+ uses: docker/build-push-action@v5
84
+ with:
85
+ context: .
86
+ file: ./.docker/Dockerfile
87
+ platforms: linux/amd64,linux/arm64
88
+ push: true
89
+ build-args: |
90
+ EXTRAS=${{ matrix.extras }}
91
+ tags: ${{ steps.meta.outputs.tags }}
92
+ labels: ${{ steps.meta.outputs.labels }}
93
+
94
+ - name: Update Docker Hub README
95
+ uses: peter-evans/dockerhub-description@v4
96
+ if: matrix.name == 'core'
97
+ with:
98
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
99
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
100
+ repository: goldziher/kreuzberg
101
+ readme-filepath: ./.docker/README.md
@@ -6,28 +6,29 @@
6
6
  *.py[cod]
7
7
  *.suo
8
8
  *.user
9
- .DS_store
9
+ *temp/
10
10
  .coverage
11
11
  .coverage*
12
+ .cursorrules
12
13
  .dist/
14
+ .DS_store
13
15
  .env
14
16
  .idea/
17
+ .kreuzberg/
15
18
  .mypy_cache/
16
19
  .pytest_cache/
17
20
  .python-version
21
+ .ropeproject
18
22
  .ruff_cache/
19
23
  .run/
20
24
  .venv/
21
25
  .vscode/
22
26
  .windsurfrules
23
- .cursorrules
24
- CLAUDE.md
25
- GEMINI.md
26
27
  __pycache__/
28
+ benchmark_results.json
29
+ CLAUDE.md
27
30
  coverage.xml
31
+ docker-compose.yaml
32
+ GEMINI.md
28
33
  prompt_template.egg-info/
29
34
  requirements.txt
30
- Dockerfile
31
- docker-compose.yaml
32
- benchmark_results.json
33
- .kreuzberg/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.3.0
3
+ Version: 3.4.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
@@ -34,12 +34,15 @@ Provides-Extra: all
34
34
  Requires-Dist: click>=8.2.1; extra == 'all'
35
35
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
36
36
  Requires-Dist: gmft>=0.4.2; extra == 'all'
37
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
37
38
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
38
39
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
39
40
  Requires-Dist: rich>=14.0.0; extra == 'all'
40
41
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
41
42
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
42
43
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
44
+ Provides-Extra: api
45
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
43
46
  Provides-Extra: chunking
44
47
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
45
48
  Provides-Extra: cli
@@ -63,10 +66,14 @@ Description-Content-Type: text/markdown
63
66
  [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
64
67
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
65
68
 
66
- Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
69
+ Kreuzberg is a **high-performance** Python library for text extraction from documents. **Benchmarked as one of the fastest text extraction libraries available**, it provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs optimized for speed and efficiency.
67
70
 
68
71
  ## Why Kreuzberg?
69
72
 
73
+ - **🚀 Substantially Faster**: Extraction speeds that significantly outperform other text extraction libraries
74
+ - **⚡ Unique Dual API**: The only framework supporting both sync and async APIs for maximum flexibility
75
+ - **💾 Memory Efficient**: Lower memory footprint compared to competing libraries
76
+ - **📊 Proven Performance**: [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) demonstrate superior performance across formats
70
77
  - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
71
78
  - **Local Processing**: No external API calls or cloud dependencies required
72
79
  - **Resource Efficient**: Lightweight processing without GPU requirements
@@ -85,6 +92,9 @@ pip install kreuzberg
85
92
 
86
93
  # Or install with CLI support
87
94
  pip install "kreuzberg[cli]"
95
+
96
+ # Or install with API server
97
+ pip install "kreuzberg[api]"
88
98
  ```
89
99
 
90
100
  Install pandoc:
@@ -134,6 +144,31 @@ async def main():
134
144
  asyncio.run(main())
135
145
  ```
136
146
 
147
+ ## Docker
148
+
149
+ Docker images are available for easy deployment:
150
+
151
+ ```bash
152
+ # Run the API server
153
+ docker run -p 8000:8000 goldziher/kreuzberg:latest
154
+
155
+ # Extract files via API
156
+ curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
157
+ ```
158
+
159
+ See the [Docker documentation](https://goldziher.github.io/kreuzberg/user-guide/docker/) for more options.
160
+
161
+ ## REST API
162
+
163
+ Run Kreuzberg as a REST API server:
164
+
165
+ ```bash
166
+ pip install "kreuzberg[api]"
167
+ litestar --app kreuzberg._api.main:app run
168
+ ```
169
+
170
+ See the [API documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/) for endpoints and usage.
171
+
137
172
  ## Command Line Interface
138
173
 
139
174
  Kreuzberg includes a powerful CLI for processing documents from the command line:
@@ -208,7 +243,31 @@ For comparison and selection guidance, see the [OCR Backends](https://goldziher.
208
243
 
209
244
  ## Performance
210
245
 
211
- Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
246
+ Kreuzberg delivers **exceptional performance** compared to other text extraction libraries:
247
+
248
+ ### 🏆 Competitive Benchmarks
249
+
250
+ [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) comparing Kreuzberg against other popular Python text extraction libraries show:
251
+
252
+ - **Fastest Extraction**: Consistently fastest processing times across file formats
253
+ - **Lowest Memory Usage**: Most memory-efficient text extraction solution
254
+ - **100% Success Rate**: Reliable extraction across all tested document types
255
+ - **Optimal for High-Throughput**: Designed for real-time, production applications
256
+
257
+ ### 💾 Installation Size Efficiency
258
+
259
+ Kreuzberg delivers maximum performance with minimal overhead:
260
+
261
+ 1. **Kreuzberg**: 71.0 MB (20 deps) - Most lightweight
262
+ 1. **Unstructured**: 145.8 MB (54 deps) - Moderate footprint
263
+ 1. **MarkItDown**: 250.7 MB (25 deps) - ML inference overhead
264
+ 1. **Docling**: 1,031.9 MB (88 deps) - Full ML stack included
265
+
266
+ **Kreuzberg is up to 14x smaller** than competing solutions while delivering superior performance.
267
+
268
+ ### ⚡ Sync vs Async Performance
269
+
270
+ Kreuzberg is the only library offering both sync and async APIs. Choose based on your use case:
212
271
 
213
272
  | Operation | Sync Time | Async Time | Async Advantage |
214
273
  | ---------------------- | --------- | ---------- | ------------------ |
@@ -218,11 +277,7 @@ Kreuzberg offers both sync and async APIs. Choose the right one based on your us
218
277
  | OCR processing | 0.4s | 0.7s | **✅ 1.7x faster** |
219
278
  | Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
220
279
 
221
- **Rule of thumb:**
222
-
223
- - Use **sync** for simple documents and CLI applications
224
- - Use **async** for complex PDFs, OCR, and batch processing
225
- - Use **batch operations** for multiple files
280
+ **Rule of thumb:** Use async for complex documents, OCR, batch processing, and backend APIs.
226
281
 
227
282
  For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
228
283
 
@@ -5,10 +5,14 @@
5
5
  [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
6
6
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
7
 
8
- Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
8
+ Kreuzberg is a **high-performance** Python library for text extraction from documents. **Benchmarked as one of the fastest text extraction libraries available**, it provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs optimized for speed and efficiency.
9
9
 
10
10
  ## Why Kreuzberg?
11
11
 
12
+ - **🚀 Substantially Faster**: Extraction speeds that significantly outperform other text extraction libraries
13
+ - **⚡ Unique Dual API**: The only framework supporting both sync and async APIs for maximum flexibility
14
+ - **💾 Memory Efficient**: Lower memory footprint compared to competing libraries
15
+ - **📊 Proven Performance**: [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) demonstrate superior performance across formats
12
16
  - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
13
17
  - **Local Processing**: No external API calls or cloud dependencies required
14
18
  - **Resource Efficient**: Lightweight processing without GPU requirements
@@ -27,6 +31,9 @@ pip install kreuzberg
27
31
 
28
32
  # Or install with CLI support
29
33
  pip install "kreuzberg[cli]"
34
+
35
+ # Or install with API server
36
+ pip install "kreuzberg[api]"
30
37
  ```
31
38
 
32
39
  Install pandoc:
@@ -76,6 +83,31 @@ async def main():
76
83
  asyncio.run(main())
77
84
  ```
78
85
 
86
+ ## Docker
87
+
88
+ Docker images are available for easy deployment:
89
+
90
+ ```bash
91
+ # Run the API server
92
+ docker run -p 8000:8000 goldziher/kreuzberg:latest
93
+
94
+ # Extract files via API
95
+ curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
96
+ ```
97
+
98
+ See the [Docker documentation](https://goldziher.github.io/kreuzberg/user-guide/docker/) for more options.
99
+
100
+ ## REST API
101
+
102
+ Run Kreuzberg as a REST API server:
103
+
104
+ ```bash
105
+ pip install "kreuzberg[api]"
106
+ litestar --app kreuzberg._api.main:app run
107
+ ```
108
+
109
+ See the [API documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/) for endpoints and usage.
110
+
79
111
  ## Command Line Interface
80
112
 
81
113
  Kreuzberg includes a powerful CLI for processing documents from the command line:
@@ -150,7 +182,31 @@ For comparison and selection guidance, see the [OCR Backends](https://goldziher.
150
182
 
151
183
  ## Performance
152
184
 
153
- Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
185
+ Kreuzberg delivers **exceptional performance** compared to other text extraction libraries:
186
+
187
+ ### 🏆 Competitive Benchmarks
188
+
189
+ [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) comparing Kreuzberg against other popular Python text extraction libraries show:
190
+
191
+ - **Fastest Extraction**: Consistently fastest processing times across file formats
192
+ - **Lowest Memory Usage**: Most memory-efficient text extraction solution
193
+ - **100% Success Rate**: Reliable extraction across all tested document types
194
+ - **Optimal for High-Throughput**: Designed for real-time, production applications
195
+
196
+ ### 💾 Installation Size Efficiency
197
+
198
+ Kreuzberg delivers maximum performance with minimal overhead:
199
+
200
+ 1. **Kreuzberg**: 71.0 MB (20 deps) - Most lightweight
201
+ 1. **Unstructured**: 145.8 MB (54 deps) - Moderate footprint
202
+ 1. **MarkItDown**: 250.7 MB (25 deps) - ML inference overhead
203
+ 1. **Docling**: 1,031.9 MB (88 deps) - Full ML stack included
204
+
205
+ **Kreuzberg is up to 14x smaller** than competing solutions while delivering superior performance.
206
+
207
+ ### ⚡ Sync vs Async Performance
208
+
209
+ Kreuzberg is the only library offering both sync and async APIs. Choose based on your use case:
154
210
 
155
211
  | Operation | Sync Time | Async Time | Async Advantage |
156
212
  | ---------------------- | --------- | ---------- | ------------------ |
@@ -160,11 +216,7 @@ Kreuzberg offers both sync and async APIs. Choose the right one based on your us
160
216
  | OCR processing | 0.7s | 0.4s | **✅ 1.7x faster** |
161
217
  | Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
162
218
 
163
- **Rule of thumb:**
164
-
165
- - Use **sync** for simple documents and CLI applications
166
- - Use **async** for complex PDFs, OCR, and batch processing
167
- - Use **batch operations** for multiple files
219
+ **Rule of thumb:** Use sync for simple documents and CLI tools; use async for complex documents, OCR, batch processing, and backend APIs.
168
220
 
169
221
  For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
170
222
 
@@ -4,18 +4,29 @@ Kreuzberg provides both synchronous and asynchronous APIs, each optimized for di
4
4
 
5
5
  ## Quick Reference
6
6
 
7
- | Use Case | Recommended API | Reason |
8
- | ------------------- | ---------------------------- | ---------------------------- |
9
- | CLI tools | `extract_file_sync()` | Lower overhead, simpler code |
10
- | Web applications | `await extract_file()` | Better concurrency |
11
- | Simple documents | `extract_file_sync()` | Faster for small files |
12
- | Complex PDFs | `await extract_file()` | Parallelized processing |
13
- | Batch processing | `await batch_extract_file()` | Concurrent execution |
14
- | OCR-heavy workloads | `await extract_file()` | Multiprocessing benefits |
7
+ | Use Case | Recommended API | Reason |
8
+ | ------------------- | ---------------------------- | -------------------------------------- |
9
+ | CLI tools | `extract_file_sync()` | Lower overhead, simpler code |
10
+ | **Backend APIs** | `await extract_file()` | **Always use async in async contexts** |
11
+ | Web applications | `await extract_file()` | Better concurrency |
12
+ | Simple documents | `extract_file_sync()` | Faster for small files |
13
+ | Complex PDFs | `await extract_file()` | Parallelized processing |
14
+ | Batch processing | `await batch_extract_file()` | Concurrent execution |
15
+ | OCR-heavy workloads | `await extract_file()` | Multiprocessing benefits |
15
16
 
16
- ## Benchmark Results
17
+ ## Competitive Performance
17
18
 
18
- All benchmarks were conducted on macOS 15.5 with ARM64 (14 cores, 48GB RAM) using Python 3.13.3.
19
+ [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) comparing Kreuzberg against other popular Python text extraction libraries demonstrate:
20
+
21
+ - **Fastest Extraction**: Consistently fastest processing times across file formats
22
+ - **Lowest Memory Usage**: Most memory-efficient text extraction solution
23
+ - **Smallest Installation**: 71.0 MB vs competitors ranging from 145.8 MB to 1,031.9 MB
24
+ - **100% Success Rate**: Reliable extraction across all tested document types
25
+ - **Optimal for High-Throughput**: Designed for real-time, production applications
26
+
27
+ ## Internal Benchmark Results
28
+
29
+ All internal benchmarks were conducted on macOS 15.5 with ARM64 (14 cores, 48GB RAM) using Python 3.13.3.
19
30
 
20
31
  ### Single Document Processing
21
32
 
@@ -50,6 +61,29 @@ All benchmarks were conducted on macOS 15.5 with ARM64 (14 cores, 48GB RAM) usin
50
61
  1. **Simpler Path**: Direct execution without thread/process coordination
51
62
  1. **Fast Startup**: Immediate execution for quick operations
52
63
 
64
+ ### Backend API Considerations
65
+
66
+ **Important**: When working in an async context (such as FastAPI, Django async views, or aiohttp), **always use the async API**, even for simple documents:
67
+
68
+ ```python
69
+ # ✅ Correct: Use async in async contexts
70
+ async def extract_endpoint(file_path: str):
71
+     result = await extract_file(file_path)  # Non-blocking
72
+     return result
73
+
74
+ # ❌ Wrong: Sync in async context blocks the event loop
75
+ async def extract_endpoint(file_path: str):
76
+     result = extract_file_sync(file_path)  # Blocks event loop!
77
+     return result
78
+ ```
79
+
80
+ **Why this matters:**
81
+
82
+ - Sync operations in async contexts block the entire event loop
83
+ - This prevents other requests from being processed concurrently
84
+ - Backend throughput drops dramatically
85
+ - Use async consistently throughout your async application stack
86
+
53
87
  ### The Crossover Point
54
88
 
55
89
  The performance crossover occurs around **10KB file size** or when **OCR is required**:
@@ -219,6 +253,17 @@ Choose your API based on your specific needs:
219
253
 
220
254
  - **Sync for simplicity**: CLI tools, simple documents, single-threaded applications
221
255
  - **Async for scale**: Web applications, batch processing, complex documents
256
+ - **Async for backends**: **Always use async in async contexts** (FastAPI, Django async, etc.)
222
257
  - **Batch for efficiency**: Multiple files, concurrent processing requirements
223
258
 
259
+ ### Key Decision Points
260
+
261
+ 1. **Are you in an async context?** → Use async API
262
+ 1. **Processing multiple files?** → Use batch operations
263
+ 1. **Simple single document in sync context?** → Sync may be faster
264
+ 1. **Complex documents or OCR required?** → Use async API
265
+ 1. **Building a web API?** → Use async API
266
+
224
267
  The performance characteristics will vary based on your specific documents, hardware, and usage patterns. We recommend benchmarking with your actual data to make informed decisions.
268
+
269
+ **Remember**: Kreuzberg is benchmarked as one of the fastest text extraction libraries available, delivering superior performance regardless of which API you choose.