kreuzberg 3.4.2__tar.gz → 3.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/publish-docker.yml +7 -27
  2. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/release.yaml +12 -0
  3. kreuzberg-3.6.0/.gitmodules +3 -0
  4. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/PKG-INFO +12 -4
  5. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/ai-rulez.yaml +25 -9
  6. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/benchmark_baseline.py +1 -1
  7. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/final_benchmark.py +1 -1
  8. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/serialization_benchmark.py +0 -1
  9. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/advanced/performance.md +46 -9
  10. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/types.md +18 -0
  11. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/changelog.md +2 -0
  12. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/examples/extraction-examples.md +118 -0
  13. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/getting-started/installation.md +33 -1
  14. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/docker.md +49 -0
  15. kreuzberg-3.6.0/docs/user-guide/extraction-configuration.md +343 -0
  16. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/ocr-configuration.md +86 -19
  17. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/__init__.py +6 -1
  18. kreuzberg-3.6.0/kreuzberg/_entity_extraction.py +239 -0
  19. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_image.py +21 -1
  20. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_pdf.py +44 -14
  21. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_spread_sheet.py +2 -2
  22. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_gmft.py +4 -4
  23. kreuzberg-3.6.0/kreuzberg/_language_detection.py +95 -0
  24. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_multiprocessing/gmft_isolated.py +2 -4
  25. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_multiprocessing/process_manager.py +2 -1
  26. kreuzberg-3.6.0/kreuzberg/_multiprocessing/sync_easyocr.py +235 -0
  27. kreuzberg-3.6.0/kreuzberg/_multiprocessing/sync_paddleocr.py +199 -0
  28. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_ocr/_easyocr.py +1 -1
  29. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_ocr/_tesseract.py +7 -3
  30. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_types.py +46 -4
  31. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_device.py +2 -2
  32. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_process_pool.py +2 -2
  33. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_sync.py +1 -5
  34. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_tmp.py +2 -2
  35. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/extraction.py +39 -12
  36. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/mkdocs.yaml +1 -0
  37. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/pyproject.toml +27 -3
  38. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/api/main_test.py +2 -5
  39. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/cli_integration_test.py +9 -1
  40. kreuzberg-3.6.0/tests/entity_extraction_test.py +102 -0
  41. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extraction_test.py +9 -2
  42. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/image_test.py +17 -4
  43. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/pdf_test.py +7 -0
  44. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/gmft_extended_test.py +6 -17
  45. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/gmft_test.py +0 -3
  46. kreuzberg-3.6.0/tests/language_detection_test.py +237 -0
  47. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/multiprocessing/sync_tesseract_test.py +2 -3
  48. kreuzberg-3.6.0/tests/test_source_files/french-text.txt +2 -0
  49. kreuzberg-3.6.0/tests/test_source_files/german-text.txt +2 -0
  50. kreuzberg-3.6.0/tests/test_source_files/spanish-text.txt +2 -0
  51. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/cache_test.py +0 -3
  52. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/errors_test.py +0 -1
  53. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/process_pool_test.py +0 -3
  54. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/sync_test.py +0 -7
  55. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/uv.lock +466 -66
  56. kreuzberg-3.4.2/docs/user-guide/extraction-configuration.md +0 -162
  57. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.commitlintrc +0 -0
  58. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.docker/Dockerfile +0 -0
  59. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.docker/README.md +0 -0
  60. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.dockerignore +0 -0
  61. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/dependabot.yaml +0 -0
  62. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/ci.yaml +0 -0
  63. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/docs.yml +0 -0
  64. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/pr-title.yaml +0 -0
  65. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.gitignore +0 -0
  66. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.markdownlint.yaml +0 -0
  67. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.pre-commit-config.yaml +0 -0
  68. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/LICENSE +0 -0
  69. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/README.md +0 -0
  70. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/README.md +0 -0
  71. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/end_to_end_benchmark.py +0 -0
  72. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/pyproject.toml +0 -0
  73. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/baseline_results.json +0 -0
  74. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  75. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
  76. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/final_benchmark_results.json +0 -0
  77. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/mime_caching_results.json +0 -0
  78. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/msgspec_caching_results.json +0 -0
  79. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/ocr_caching_results.json +0 -0
  80. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
  81. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
  82. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/table_caching_results.json +0 -0
  83. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  84. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  85. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  86. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  87. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  88. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  89. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  90. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/statistical_benchmark.py +0 -0
  91. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/advanced/custom-extractors.md +0 -0
  92. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/advanced/custom-hooks.md +0 -0
  93. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/advanced/error-handling.md +0 -0
  94. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/advanced/index.md +0 -0
  95. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/exceptions.md +0 -0
  96. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/extraction-functions.md +0 -0
  97. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/extractor-registry.md +0 -0
  98. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/index.md +0 -0
  99. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/ocr-configuration.md +0 -0
  100. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/assets/favicon.png +0 -0
  101. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/assets/logo.png +0 -0
  102. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/cli.md +0 -0
  103. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/contributing.md +0 -0
  104. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/css/extra.css +0 -0
  105. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/examples/index.md +0 -0
  106. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/getting-started/index.md +0 -0
  107. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/getting-started/quick-start.md +0 -0
  108. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/index.md +0 -0
  109. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/api-server.md +0 -0
  110. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/basic-usage.md +0 -0
  111. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/chunking.md +0 -0
  112. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/index.md +0 -0
  113. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/metadata-extraction.md +0 -0
  114. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/ocr-backends.md +0 -0
  115. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/supported-formats.md +0 -0
  116. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/__main__.py +0 -0
  117. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_api/__init__.py +0 -0
  118. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_api/main.py +0 -0
  119. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_chunker.py +0 -0
  120. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_cli_config.py +0 -0
  121. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_constants.py +0 -0
  122. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/__init__.py +0 -0
  123. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_base.py +0 -0
  124. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_html.py +0 -0
  125. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_pandoc.py +0 -0
  126. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_presentation.py +0 -0
  127. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_mime_types.py +0 -0
  128. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_multiprocessing/__init__.py +0 -0
  129. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
  130. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
  131. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_ocr/__init__.py +0 -0
  132. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_ocr/_base.py +0 -0
  133. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
  134. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_playa.py +0 -0
  135. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_registry.py +0 -0
  136. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/__init__.py +0 -0
  137. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_cache.py +0 -0
  138. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_document_cache.py +0 -0
  139. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_errors.py +0 -0
  140. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  141. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_serialization.py +0 -0
  142. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_string.py +0 -0
  143. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/cli.py +0 -0
  144. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/exceptions.py +0 -0
  145. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/py.typed +0 -0
  146. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/__init__.py +0 -0
  147. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/api/__init__.py +0 -0
  148. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/chunker_test.py +0 -0
  149. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/cli_test.py +0 -0
  150. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/conftest.py +0 -0
  151. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/exceptions_test.py +0 -0
  152. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extraction_batch_test.py +0 -0
  153. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/__init__.py +0 -0
  154. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/html_test.py +0 -0
  155. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  156. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/pandoc_test.py +0 -0
  157. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/presentation_test.py +0 -0
  158. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/spreed_sheet_test.py +0 -0
  159. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/hooks_test.py +0 -0
  160. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/mime_types_test.py +0 -0
  161. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/multiprocessing/__init__.py +0 -0
  162. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
  163. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/multiprocessing/process_manager_test.py +0 -0
  164. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  165. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/__init__.py +0 -0
  166. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/base_test.py +0 -0
  167. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/device_integration_test.py +0 -0
  168. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/easyocr_test.py +0 -0
  169. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/init_test.py +0 -0
  170. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/paddleocr_test.py +0 -0
  171. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/tesseract_test.py +0 -0
  172. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/playa_test.py +0 -0
  173. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/registry_test.py +0 -0
  174. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/document.docx +0 -0
  175. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  176. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/excel.xlsx +0 -0
  177. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/html.html +0 -0
  178. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/markdown.md +0 -0
  179. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  180. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/non-searchable.pdf +0 -0
  181. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/ocr-image.jpg +0 -0
  182. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  183. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  184. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  185. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  186. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/sample-contract.pdf +0 -0
  187. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/scanned.pdf +0 -0
  188. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/searchable.pdf +0 -0
  189. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/test-article.pdf +0 -0
  190. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/types_test.py +0 -0
  191. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/__init__.py +0 -0
  192. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/device_test.py +0 -0
  193. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/pdf_lock_test.py +0 -0
  194. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/serialization_test.py +0 -0
  195. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/string_test.py +0 -0
  196. {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/tmp_test.py +0 -0
@@ -1,20 +1,13 @@
1
- # .github/workflows/publish-docker.yml
2
1
 
3
2
  name: Publish Docker Images
4
3
 
5
4
  on:
6
- workflow_run:
7
- workflows: ["Release"]
8
- types:
9
- - completed
10
- branches:
11
- - main
12
5
  workflow_dispatch:
13
6
 
14
7
  jobs:
15
8
  build-and-push:
16
9
  runs-on: ubuntu-latest
17
- if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
10
+ if: ${{ github.event_name == 'workflow_dispatch' }}
18
11
  permissions:
19
12
  contents: read
20
13
  packages: write
@@ -24,7 +17,7 @@ jobs:
24
17
  include:
25
18
  - name: core
26
19
  extras: ""
27
- tag_suffix: "" # The base image tag (includes API + tesseract)
20
+ tag_suffix: ""
28
21
  - name: easyocr
29
22
  extras: "easyocr"
30
23
  tag_suffix: "-easyocr"
@@ -42,27 +35,14 @@ jobs:
42
35
  - name: Checkout repository
43
36
  uses: actions/checkout@v4
44
37
  with:
45
- ref: ${{ github.event.workflow_run.head_branch || github.ref }}
38
+ ref: ${{ github.ref }}
46
39
 
47
40
  - name: Get release version
48
41
  id: get_version
49
42
  run: |
50
- if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
51
- # For manual dispatch, get the latest tag by listing all tags
52
- git fetch --tags
53
- VERSION=$(git tag --sort=-version:refname | head -n1)
54
- else
55
- # For workflow_run, use the head branch
56
- VERSION="${{ github.event.workflow_run.head_branch }}"
57
- # If triggered by a tag, extract version
58
- if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
59
- VERSION="$VERSION"
60
- else
61
- # Get the latest tag by listing all tags
62
- git fetch --tags
63
- VERSION=$(git tag --sort=-version:refname | head -n1)
64
- fi
65
- fi
43
+ # Get the latest tag by listing all tags
44
+ git fetch --tags
45
+ VERSION=$(git tag --sort=-version:refname | head -n1)
66
46
  echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
67
47
 
68
48
  - name: Set up QEMU
@@ -89,7 +69,7 @@ jobs:
89
69
  type=raw,value=latest${{ matrix.tag_suffix }}
90
70
 
91
71
  - name: Build and push Docker image
92
- uses: docker/build-push-action@v5
72
+ uses: docker/build-push-action@v6
93
73
  with:
94
74
  context: .
95
75
  file: ./.docker/Dockerfile
@@ -29,3 +29,15 @@ jobs:
29
29
 
30
30
  - name: Publish
31
31
  uses: pypa/gh-action-pypi-publish@release/v1
32
+
33
+ - name: Trigger Docker Build
34
+ uses: actions/github-script@v7
35
+ with:
36
+ github-token: ${{ secrets.GITHUB_TOKEN }}
37
+ script: |
38
+ await github.rest.actions.createWorkflowDispatch({
39
+ owner: context.repo.owner,
40
+ repo: context.repo.repo,
41
+ workflow_id: 'publish-docker.yml',
42
+ ref: 'main'
43
+ });
@@ -0,0 +1,3 @@
1
+ [submodule "python-text-extraction-libs-benchmarks"]
2
+ path = python-text-extraction-libs-benchmarks
3
+ url = https://github.com/Goldziher/python-text-extraction-libs-benchmarks.git
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.4.2
3
+ Version: 3.6.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
7
7
  License: MIT
8
8
  License-File: LICENSE
9
- Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
9
+ Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Intended Audience :: Developers
12
12
  Classifier: License :: OSI Approved :: MIT License
@@ -36,16 +36,19 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
36
36
  Provides-Extra: all
37
37
  Requires-Dist: click>=8.2.1; extra == 'all'
38
38
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
39
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
39
40
  Requires-Dist: gmft>=0.4.2; extra == 'all'
40
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
41
+ Requires-Dist: keybert>=0.9.0; extra == 'all'
42
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
41
43
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
42
44
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
43
45
  Requires-Dist: rich>=14.0.0; extra == 'all'
44
46
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
45
47
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
48
+ Requires-Dist: spacy>=3.8.7; extra == 'all'
46
49
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
47
50
  Provides-Extra: api
48
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
51
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
49
52
  Provides-Extra: chunking
50
53
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
51
54
  Provides-Extra: cli
@@ -54,8 +57,13 @@ Requires-Dist: rich>=14.0.0; extra == 'cli'
54
57
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
55
58
  Provides-Extra: easyocr
56
59
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
60
+ Provides-Extra: entity-extraction
61
+ Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
62
+ Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
57
63
  Provides-Extra: gmft
58
64
  Requires-Dist: gmft>=0.4.2; extra == 'gmft'
65
+ Provides-Extra: langdetect
66
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
59
67
  Provides-Extra: paddleocr
60
68
  Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
61
69
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
@@ -1,6 +1,6 @@
1
1
  metadata:
2
2
  name: "Kreuzberg"
3
- version: "3.4.0"
3
+ version: "3.5.0"
4
4
  description: "A text extraction library supporting PDFs, images, office documents and more"
5
5
 
6
6
  outputs:
@@ -115,6 +115,7 @@ rules:
115
115
  - **OCR Backends**: Pluggable OCR engines with separate configuration classes
116
116
  - **GMFT Integration**: Table extraction using GMFT library for PDFs
117
117
  - **Chunking**: Text splitting functionality in `_chunker.py`
118
+ - **Language Detection**: Automatic language detection using fast-langdetect
118
119
  - **Async/Sync**: Primary async implementation with sync wrappers in `_utils/_sync.py`
119
120
  - **API Server**: REST API using Litestar framework in `_api/main.py`
120
121
  - **CLI**: Command-line interface for batch processing and automation
@@ -144,6 +145,8 @@ rules:
144
145
  - Mock OCR responses for predictable testing
145
146
  - Both sync and async test variants
146
147
  - Comprehensive error case coverage
148
+ - OCR tests marked as `xfail` in CI environments for resilience
149
+ - Integration tests use timeouts and retry logic where appropriate
147
150
 
148
151
  - name: "Important Instructions"
149
152
  priority: 10
@@ -160,16 +163,17 @@ rules:
160
163
  priority: 6
161
164
  content: |
162
165
  ### GitHub Actions Workflows
163
- - **Release**: Automated PyPI publishing via GitHub releases
164
- - **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64)
166
+ - **Release**: Automated PyPI publishing via GitHub releases, triggers Docker builds
167
+ - **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64), triggered by releases
165
168
  - **Documentation**: Auto-deploy to GitHub Pages on docs changes
169
+ - **CI**: Comprehensive testing across multiple Python versions and platforms
166
170
 
167
171
  ### Docker Variants
168
- - **Core** (`goldziher/kreuzberg:v3.4.0`): API + Tesseract OCR
169
- - **EasyOCR** (`goldziher/kreuzberg:v3.4.0-easyocr`): Core + EasyOCR
170
- - **PaddleOCR** (`goldziher/kreuzberg:v3.4.0-paddle`): Core + PaddleOCR
171
- - **GMFT** (`goldziher/kreuzberg:v3.4.0-gmft`): Core + table extraction
172
- - **All** (`goldziher/kreuzberg:v3.4.0-all`): All features included
172
+ - **Core** (`goldziher/kreuzberg:v3.5.0`): API + Tesseract OCR
173
+ - **EasyOCR** (`goldziher/kreuzberg:v3.5.0-easyocr`): Core + EasyOCR
174
+ - **PaddleOCR** (`goldziher/kreuzberg:v3.5.0-paddle`): Core + PaddleOCR
175
+ - **GMFT** (`goldziher/kreuzberg:v3.5.0-gmft`): Core + table extraction
176
+ - **All** (`goldziher/kreuzberg:v3.5.0-all`): All features included
173
177
 
174
178
  ### Manual Triggers
175
179
  - Docker builds: `gh workflow run "Publish Docker Images"`
@@ -191,8 +195,9 @@ rules:
191
195
  chunking = ["semantic-text-splitter>=0.27.0"]
192
196
  easyocr = ["easyocr>=1.7.2"]
193
197
  gmft = ["gmft>=0.4.2"]
198
+ langdetect = ["fast-langdetect>=0.3.2"]
194
199
  paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
195
- all = ["kreuzberg[api,chunking,cli,easyocr,gmft,paddleocr]"]
200
+ all = ["kreuzberg[api,chunking,cli,easyocr,gmft,langdetect,paddleocr]"]
196
201
  ```
197
202
 
198
203
  ### Installation Patterns
@@ -207,6 +212,17 @@ rules:
207
212
  - **Development**: Uses dependency groups in pyproject.toml
208
213
 
209
214
  sections:
215
+ - title: "Language Detection"
216
+ content: |
217
+ ### Automatic Language Detection (v3.5.0+)
218
+ - **Feature**: Automatically detect languages in extracted text
219
+ - **Implementation**: Uses fast-langdetect library for high-performance detection
220
+ - **Configuration**:
221
+ - Enable with `auto_detect_language=True` in `ExtractionConfig`
222
+ - Configure via `LanguageDetectionConfig` for confidence thresholds
223
+ - **Output**: Results available in `ExtractionResult.detected_languages`
224
+ - **Integration**: Works with all extraction methods and file types
225
+
210
226
  - title: "Planned Features"
211
227
  content: |
212
228
  ### Structured Extraction (Issue #55)
@@ -8,7 +8,7 @@ from kreuzberg import ExtractionConfig, batch_extract_file, extract_file_sync
8
8
  from kreuzberg._utils._document_cache import clear_document_cache, get_document_cache
9
9
 
10
10
 
11
- async def run_baseline_benchmark() -> dict[str, object] | None: # type: ignore[syntax]
11
+ async def run_baseline_benchmark() -> dict[str, object] | None:
12
12
  """Run comprehensive baseline benchmark."""
13
13
  test_files_dir = Path("tests/test_source_files")
14
14
  test_files = list(test_files_dir.glob("*.pdf"))
@@ -15,7 +15,7 @@ from kreuzberg._utils._cache import (
15
15
  )
16
16
 
17
17
 
18
- async def run_final_benchmark() -> dict[str, object] | None: # type: ignore[syntax]
18
+ async def run_final_benchmark() -> dict[str, object] | None:
19
19
  """Run comprehensive benchmark of all caching improvements."""
20
20
  test_files_dir = Path("tests/test_source_files")
21
21
  pdf_files = list(test_files_dir.glob("*.pdf"))
@@ -101,7 +101,6 @@ def benchmark_serialization() -> dict[str, object]:
101
101
  json_serialize = analyze_times(json_serialize_times, "JSON Serialize")
102
102
  json_deserialize = analyze_times(json_deserialize_times, "JSON Deserialize")
103
103
 
104
- # Type casting for arithmetic operations
105
104
  json_ser_mean = json_serialize["mean"]
106
105
  json_deser_mean = json_deserialize["mean"]
107
106
  msgpack_ser_mean = msgpack_serialize["mean"]
@@ -125,22 +125,59 @@ The async API leverages Python's asyncio with intelligent task scheduling:
125
125
  1. **Configure OCR appropriately** for your document types
126
126
  1. **Profile your specific workload** - results vary by content
127
127
 
128
- ### Configuration Examples
128
+ ### Optimized Default Configuration
129
+
130
+ Kreuzberg's default configuration is **optimized out-of-the-box for modern PDFs and standard documents**:
129
131
 
130
132
  ```python
131
- from kreuzberg import ExtractionConfig, extract_file_sync
132
- from kreuzberg._ocr import TesseractConfig
133
+ from kreuzberg import ExtractionConfig
133
134
 
134
- # Optimized for speed
135
- fast_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=6)) # Assume uniform text block
135
+ # Default configuration - already optimized for modern documents
136
+ config = ExtractionConfig() # Uses optimized defaults:
137
+ # - PSM: AUTO_ONLY (fast without orientation detection)
138
+ # - Language model: Disabled for performance
139
+ # - Dictionary correction: Enabled for accuracy
140
+ ```
136
141
 
137
- # Optimized for accuracy
138
- accurate_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=1)) # Auto page segmentation
142
+ ### Advanced Configuration Examples
139
143
 
140
- # For simple documents (no OCR)
141
- text_only_config = ExtractionConfig(force_ocr=False, ocr_backend=None)
144
+ ```python
145
+ from kreuzberg import ExtractionConfig, extract_file_sync
146
+ from kreuzberg._ocr._tesseract import TesseractConfig, PSMMode
147
+
148
+ # Maximum speed configuration (for high-volume processing)
149
+ speed_config = ExtractionConfig(
150
+ ocr_backend="tesseract",
151
+ ocr_config=TesseractConfig(
152
+ psm=PSMMode.SINGLE_BLOCK, # Assume simple layout
153
+ language_model_ngram_on=False, # Already disabled by default
154
+ tessedit_enable_dict_correction=False, # Disable for maximum speed
155
+ ),
156
+ )
157
+
158
+ # Maximum accuracy configuration (for degraded documents)
159
+ accuracy_config = ExtractionConfig(
160
+ ocr_backend="tesseract",
161
+ ocr_config=TesseractConfig(
162
+ psm=PSMMode.AUTO, # Full analysis with orientation detection
163
+ language_model_ngram_on=True, # Enable for historical/degraded text
164
+ tessedit_enable_dict_correction=True, # Default - keep enabled
165
+ ),
166
+ )
167
+
168
+ # No OCR configuration (text documents only)
169
+ text_only_config = ExtractionConfig(ocr_backend=None, force_ocr=False)
142
170
  ```
143
171
 
172
+ ### Performance Optimization Tips
173
+
174
+ Based on comprehensive benchmarking with 138+ documents:
175
+
176
+ 1. **Disable OCR for text documents**: Setting `ocr_backend=None` provides significant speedup for documents with text layers
177
+ 1. **Use PSM `AUTO_ONLY` (default)**: Optimized for modern documents without orientation detection overhead
178
+ 1. **Language model trade-offs**: Disabling `language_model_ngram_on` can provide 30x+ speedup with minimal quality impact on clean documents
179
+ 1. **Dictionary correction**: Disabling `tessedit_enable_dict_correction` speeds up processing for technical documents
180
+
144
181
  ### Batch Processing Best Practices
145
182
 
146
183
  ```python
@@ -40,10 +40,28 @@ Configuration options for the GMFT table extraction engine:
40
40
 
41
41
  ::: kreuzberg.GMFTConfig
42
42
 
43
+ ## Entity Extraction Configuration
44
+
45
+ Configuration options for spaCy-based entity extraction:
46
+
47
+ ::: kreuzberg.SpacyEntityExtractionConfig
48
+
49
+ ## Language Detection Configuration
50
+
51
+ Configuration options for automatic language detection:
52
+
53
+ ::: kreuzberg.LanguageDetectionConfig
54
+
43
55
  ## PSMMode (Page Segmentation Mode)
44
56
 
45
57
  ::: kreuzberg.PSMMode
46
58
 
59
+ ## Entity
60
+
61
+ Represents an extracted named entity:
62
+
63
+ ::: kreuzberg.Entity
64
+
47
65
  ## Metadata
48
66
 
49
67
  A TypedDict that contains optional metadata fields extracted from documents:
@@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
12
12
  - Documentation site with comprehensive examples and API reference
13
13
  - Improved configuration for all OCR backends
14
14
  - Added hooks system for validation and post-processing
15
+ - Language detection feature with `auto_detect_language` configuration option
16
+ - New optional dependency group `langdetect` for automatic language detection
15
17
 
16
18
  ### Changed
17
19
 
@@ -79,6 +79,47 @@ async def extract_with_different_backends():
79
79
  print(f"No OCR result: {result.content[:100]}...")
80
80
  ```
81
81
 
82
+ ## Language Detection
83
+
84
+ ```python
85
+ from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
86
+
87
+ async def detect_document_language():
88
+ # Simple automatic language detection
89
+ result = await extract_file("document.pdf", config=ExtractionConfig(auto_detect_language=True))
90
+
91
+ # Access detected languages
92
+ if result.detected_languages:
93
+ print(f"Detected languages: {', '.join(result.detected_languages)}")
94
+ # Example output: "Detected languages: en, de, fr"
95
+
96
+ async def detect_multilingual_document():
97
+ # Advanced multilingual detection with custom configuration
98
+ lang_config = LanguageDetectionConfig(
99
+ multilingual=True, # Detect multiple languages in mixed text
100
+ top_k=5, # Return top 5 languages
101
+ low_memory=False, # Use high accuracy mode
102
+ )
103
+
104
+ result = await extract_file(
105
+ "multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
106
+ )
107
+
108
+ if result.detected_languages:
109
+ print(f"Detected languages: {result.detected_languages}")
110
+
111
+ # Use detected languages for OCR
112
+ from kreuzberg import TesseractConfig
113
+
114
+ # Create language string for Tesseract (e.g., "eng+deu+fra"). NOTE: detected codes such as "en" must be mapped to Tesseract's three-letter codes first
115
+ tesseract_langs = "+".join(result.detected_languages[:3])
116
+
117
+ result_with_ocr = await extract_file(
118
+ "multilingual_document.pdf",
119
+ config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=tesseract_langs)),
120
+ )
121
+ ```
122
+
82
123
  ## Table Extraction
83
124
 
84
125
  ```python
@@ -148,6 +189,83 @@ async def process_upload(file_content: bytes, mime_type: str):
148
189
  print(f"{key}: {value}")
149
190
  ```
150
191
 
192
+ ## Keywords
193
+
194
+ Kreuzberg supports keyword extraction as shown below (custom regex patterns are covered in the next section):
195
+
196
+ ```python
197
+ from kreuzberg import ExtractionConfig, extract_file
198
+
199
+ async def extract_keywords():
200
+ config = ExtractionConfig(
201
+ extract_keywords=True,
202
+ keyword_count=5, # defaults to 10 if not set
203
+ )
204
+ result = await extract_file(
205
+ "document.pdf",
206
+ config=config,
207
+ )
208
+ print(f"Keywords: {result.keywords}")
209
+ ```
210
+
211
+ ## Entity and Keyword Extraction
212
+
213
+ Kreuzberg can extract named entities using spaCy and keywords using KeyBERT. It automatically detects entities such as people, organizations, locations, and more, and it also supports custom regex patterns via `custom_entity_patterns`:
214
+
215
+ ```python
216
+ from kreuzberg import ExtractionConfig, extract_file, SpacyEntityExtractionConfig
217
+
218
+ async def extract_entities_and_keywords():
219
+ # Basic extraction
220
+ config = ExtractionConfig(
221
+ extract_entities=True,
222
+ extract_keywords=True,
223
+ keyword_count=5,
224
+ custom_entity_patterns={
225
+ "INVOICE_ID": r"INV-\d+",
226
+ "EMAIL": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
227
+ },
228
+ )
229
+ result = await extract_file("document.pdf", config=config)
230
+
231
+ # Print extracted entities
232
+ if result.entities:
233
+ for entity in result.entities:
234
+ print(f"{entity.type}: {entity.text}")
235
+
236
+ # Print extracted keywords
237
+ if result.keywords:
238
+ for keyword, score in result.keywords:
239
+ print(f"Keyword: {keyword} (score: {score:.3f})")
240
+
241
+ async def extract_multilingual_entities():
242
+ # Configure spaCy for multiple languages
243
+ spacy_config = SpacyEntityExtractionConfig(
244
+ language_models={
245
+ "en": "en_core_web_sm",
246
+ "de": "de_core_news_sm",
247
+ "fr": "fr_core_news_sm",
248
+ },
249
+ fallback_to_multilingual=True,
250
+ )
251
+
252
+ config = ExtractionConfig(
253
+ auto_detect_language=True, # Automatically detect document languages
254
+ extract_entities=True,
255
+ spacy_entity_extraction_config=spacy_config,
256
+ )
257
+
258
+ result = await extract_file("multilingual_document.pdf", config=config)
259
+
260
+ if result.detected_languages:
261
+ print(f"Detected languages: {result.detected_languages}")
262
+
263
+ if result.entities:
264
+ print(f"Extracted {len(result.entities)} entities")
265
+ for entity in result.entities:
266
+ print(f" {entity.type}: {entity.text}")
267
+ ```
268
+
151
269
  ## Synchronous API
152
270
 
153
271
  For cases where async isn't needed or available:
@@ -102,6 +102,38 @@ Table extraction is an optional feature that allows Kreuzberg to extract tables
102
102
  pip install "kreuzberg[gmft]"
103
103
  ```
104
104
 
105
+ ### Language Detection
106
+
107
+ Language detection is an optional feature that automatically detects the language of extracted text. It uses the [fast-langdetect](https://github.com/LlmKira/fast-langdetect) package. To install Kreuzberg with language detection support, you can use:
108
+
109
+ ```shell
110
+ pip install "kreuzberg[langdetect]"
111
+ ```
112
+
113
+ ### Entity and Keyword Extraction
114
+
115
+ Entity and keyword extraction are optional features that extract named entities and keywords from documents. Entity extraction uses [spaCy](https://spacy.io/) for multilingual named entity recognition, while keyword extraction uses [KeyBERT](https://github.com/MaartenGr/KeyBERT) for semantic keyword extraction:
116
+
117
+ ```shell
118
+ pip install "kreuzberg[entity-extraction]"
119
+ ```
120
+
121
+ After installation, you'll need to download the spaCy language models you plan to use:
122
+
123
+ ```shell
124
+ # Download English model (most common)
125
+ python -m spacy download en_core_web_sm
126
+
127
+ # Download other language models as needed
128
+ python -m spacy download de_core_news_sm # German
129
+ python -m spacy download fr_core_news_sm # French
130
+ python -m spacy download es_core_news_sm # Spanish
131
+ ```
132
+
133
+ !!! note "Language Model Requirements"
134
+
135
+ spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
136
+
105
137
  ### All Optional Dependencies
106
138
 
107
139
  To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -113,5 +145,5 @@ pip install "kreuzberg[all]"
113
145
  This is equivalent to:
114
146
 
115
147
  ```shell
116
- pip install "kreuzberg[chunking,easyocr,gmft,paddleocr]"
148
+ pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
117
149
  ```
@@ -124,6 +124,34 @@ Additional dependencies by variant:
124
124
  - **gmft**: GMFT for table extraction
125
125
  - **all**: All optional dependencies
126
126
 
127
+ ### Health Check
128
+
129
+ All Docker images include a health check endpoint:
130
+
131
+ ```bash
132
+ # Check API health
133
+ curl http://localhost:8000/health
134
+ ```
135
+
136
+ The endpoint returns a JSON response with service status and version information.
137
+
138
+ ### Observability
139
+
140
+ The Docker images include built-in OpenTelemetry instrumentation via Litestar:
141
+
142
+ - **Tracing**: Automatic request/response tracing
143
+ - **Metrics**: Performance and usage metrics
144
+ - **Logging**: Structured JSON logging
145
+
146
+ Configure via standard OpenTelemetry environment variables:
147
+
148
+ ```bash
149
+ docker run -p 8000:8000 \
150
+ -e OTEL_SERVICE_NAME=kreuzberg-api \
151
+ -e OTEL_EXPORTER_OTLP_ENDPOINT=http://your-collector:4317 \
152
+ goldziher/kreuzberg:latest
153
+ ```
154
+
127
155
  ### Environment Variables
128
156
 
129
157
  - `PYTHONUNBUFFERED=1` - Ensures proper logging output
@@ -150,6 +178,12 @@ server {
150
178
  client_max_body_size 100M;
151
179
  proxy_read_timeout 300s;
152
180
  }
181
+
182
+ # Health check endpoint
183
+ location /health {
184
+ proxy_pass http://localhost:8000/health;
185
+ access_log off;
186
+ }
153
187
  }
154
188
  ```
155
189
 
@@ -175,6 +209,21 @@ spec:
175
209
  image: goldziher/kreuzberg:latest
176
210
  ports:
177
211
  - containerPort: 8000
212
+ livenessProbe:
213
+ httpGet:
214
+ path: /health
215
+ port: 8000
216
+ initialDelaySeconds: 30
217
+ periodSeconds: 10
218
+ readinessProbe:
219
+ httpGet:
220
+ path: /health
221
+ port: 8000
222
+ initialDelaySeconds: 5
223
+ periodSeconds: 5
224
+ env:
225
+ - name: OTEL_SERVICE_NAME
226
+ value: "kreuzberg-api"
178
227
  resources:
179
228
  requests:
180
229
  memory: "512Mi"