kreuzberg 3.8.0__tar.gz → 3.8.1__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (217)
  1. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/PKG-INFO +31 -43
  2. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/README.md +22 -39
  3. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/index.md +11 -15
  4. kreuzberg-3.8.1/docs/performance-analysis.md +140 -0
  5. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_entity_extraction.py +1 -2
  6. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_base.py +3 -5
  7. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_image.py +18 -32
  8. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_pandoc.py +3 -14
  9. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_pdf.py +19 -40
  10. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_gmft.py +314 -7
  11. kreuzberg-3.8.1/kreuzberg/_ocr/__init__.py +26 -0
  12. kreuzberg-3.8.1/kreuzberg/_ocr/_base.py +113 -0
  13. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_ocr/_easyocr.py +91 -0
  14. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_ocr/_paddleocr.py +89 -0
  15. kreuzberg-3.8.1/kreuzberg/_ocr/_tesseract.py +996 -0
  16. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_cache.py +35 -2
  17. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_errors.py +3 -7
  18. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_process_pool.py +2 -6
  19. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/cli.py +1 -2
  20. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/extraction.py +4 -22
  21. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/mkdocs.yaml +1 -0
  22. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/pyproject.toml +17 -12
  23. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extraction_batch_test.py +4 -4
  24. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/image_test.py +52 -69
  25. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/gmft_test.py +15 -2
  26. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/multiprocessing/gmft_isolated_test.py +11 -10
  27. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/multiprocessing/tesseract_pool_test.py +4 -4
  28. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/base_test.py +14 -0
  29. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/easyocr_test.py +36 -0
  30. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/paddleocr_test.py +50 -0
  31. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/tesseract_test.py +44 -0
  32. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/process_pool_test.py +1 -1
  33. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/uv.lock +1 -1
  34. kreuzberg-3.8.0/kreuzberg/_multiprocessing/__init__.py +0 -5
  35. kreuzberg-3.8.0/kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  36. kreuzberg-3.8.0/kreuzberg/_ocr/__init__.py +0 -47
  37. kreuzberg-3.8.0/kreuzberg/_ocr/_base.py +0 -54
  38. kreuzberg-3.8.0/kreuzberg/_ocr/_pool.py +0 -357
  39. kreuzberg-3.8.0/kreuzberg/_ocr/_sync.py +0 -566
  40. kreuzberg-3.8.0/kreuzberg/_ocr/_tesseract.py +0 -440
  41. kreuzberg-3.8.0/tests/multiprocessing/sync_easyocr_test.py +0 -640
  42. kreuzberg-3.8.0/tests/multiprocessing/sync_paddleocr_test.py +0 -529
  43. kreuzberg-3.8.0/tests/multiprocessing/sync_tesseract_test.py +0 -362
  44. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.commitlintrc +0 -0
  45. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.docker/Dockerfile +0 -0
  46. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.docker/README.md +0 -0
  47. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.dockerignore +0 -0
  48. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/dependabot.yaml +0 -0
  49. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/workflows/ci.yaml +0 -0
  50. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/workflows/docs.yml +0 -0
  51. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/workflows/pr-title.yaml +0 -0
  52. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/workflows/publish-docker.yml +0 -0
  53. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/workflows/release.yaml +0 -0
  54. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.gitignore +0 -0
  55. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.markdownlint.yaml +0 -0
  56. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.pre-commit-config.yaml +0 -0
  57. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/LICENSE +0 -0
  58. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/ai-rulez.yaml +0 -0
  59. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/README.md +0 -0
  60. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/benchmark_baseline.py +0 -0
  61. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/end_to_end_benchmark.py +0 -0
  62. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/final_benchmark.py +0 -0
  63. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/pyproject.toml +0 -0
  64. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/baseline_results.json +0 -0
  65. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  66. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
  67. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/final_benchmark_results.json +0 -0
  68. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/latest.json +0 -0
  69. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/mime_caching_results.json +0 -0
  70. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/msgspec_caching_results.json +0 -0
  71. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/ocr_caching_results.json +0 -0
  72. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
  73. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
  74. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/table_caching_results.json +0 -0
  75. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/serialization_benchmark.py +0 -0
  76. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  77. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  78. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  79. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  80. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  81. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  82. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  83. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/statistical_benchmark.py +0 -0
  84. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/advanced/custom-extractors.md +0 -0
  85. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/advanced/custom-hooks.md +0 -0
  86. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/advanced/error-handling.md +0 -0
  87. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/advanced/index.md +0 -0
  88. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/advanced/performance.md +0 -0
  89. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/exceptions.md +0 -0
  90. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/extraction-functions.md +0 -0
  91. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/extractor-registry.md +0 -0
  92. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/index.md +0 -0
  93. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/ocr-configuration.md +0 -0
  94. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/types.md +0 -0
  95. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/assets/favicon.png +0 -0
  96. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/assets/logo.png +0 -0
  97. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/changelog.md +0 -0
  98. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/cli.md +0 -0
  99. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/contributing.md +0 -0
  100. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/css/extra.css +0 -0
  101. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/examples/extraction-examples.md +0 -0
  102. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/examples/index.md +0 -0
  103. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/getting-started/index.md +0 -0
  104. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/getting-started/installation.md +0 -0
  105. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/getting-started/quick-start.md +0 -0
  106. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/api-server.md +0 -0
  107. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/basic-usage.md +0 -0
  108. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/chunking.md +0 -0
  109. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/docker.md +0 -0
  110. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/extraction-configuration.md +0 -0
  111. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/index.md +0 -0
  112. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/mcp-server.md +0 -0
  113. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/metadata-extraction.md +0 -0
  114. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/ocr-backends.md +0 -0
  115. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/ocr-configuration.md +0 -0
  116. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/supported-formats.md +0 -0
  117. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/__init__.py +0 -0
  118. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/__main__.py +0 -0
  119. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_api/__init__.py +0 -0
  120. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_api/main.py +0 -0
  121. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_chunker.py +0 -0
  122. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_cli_config.py +0 -0
  123. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_constants.py +0 -0
  124. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/__init__.py +0 -0
  125. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_email.py +0 -0
  126. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_html.py +0 -0
  127. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_presentation.py +0 -0
  128. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  129. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_structured.py +0 -0
  130. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_language_detection.py +0 -0
  131. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_mcp/__init__.py +0 -0
  132. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_mcp/server.py +0 -0
  133. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_mime_types.py +0 -0
  134. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_playa.py +0 -0
  135. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_registry.py +0 -0
  136. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_types.py +0 -0
  137. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/__init__.py +0 -0
  138. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_device.py +0 -0
  139. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_document_cache.py +0 -0
  140. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
  141. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_quality.py +0 -0
  142. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_serialization.py +0 -0
  143. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_string.py +0 -0
  144. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_sync.py +0 -0
  145. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_table.py +0 -0
  146. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_tmp.py +0 -0
  147. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/exceptions.py +0 -0
  148. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/py.typed +0 -0
  149. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/__init__.py +0 -0
  150. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/api/__init__.py +0 -0
  151. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/api/main_test.py +0 -0
  152. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/chunker_test.py +0 -0
  153. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/cli_integration_test.py +0 -0
  154. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/cli_test.py +0 -0
  155. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/conftest.py +0 -0
  156. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/entity_extraction_test.py +0 -0
  157. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/exceptions_test.py +0 -0
  158. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extraction_test.py +0 -0
  159. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/__init__.py +0 -0
  160. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/email_comprehensive_test.py +0 -0
  161. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/email_test.py +0 -0
  162. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/html_test.py +0 -0
  163. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/pandoc_metadata_test.py +0 -0
  164. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/pandoc_test.py +0 -0
  165. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/pdf_test.py +0 -0
  166. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/presentation_test.py +0 -0
  167. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/spreed_sheet_test.py +0 -0
  168. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/structured_test.py +0 -0
  169. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/gmft_extended_test.py +0 -0
  170. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/hooks_test.py +0 -0
  171. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/language_detection_test.py +0 -0
  172. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/mcp_server_test.py +0 -0
  173. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/mime_types_test.py +0 -0
  174. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/multiprocessing/__init__.py +0 -0
  175. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
  176. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/multiprocessing/process_manager_test.py +0 -0
  177. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/__init__.py +0 -0
  178. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/device_integration_test.py +0 -0
  179. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/init_test.py +0 -0
  180. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/playa_test.py +0 -0
  181. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/registry_test.py +0 -0
  182. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/better-ocr-image.jpg +0 -0
  183. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/document.docx +0 -0
  184. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/email/sample-email.eml +0 -0
  185. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  186. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/excel.xlsx +0 -0
  187. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/french-text.txt +0 -0
  188. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/german-text.txt +0 -0
  189. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/html.html +0 -0
  190. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/json/sample-document.json +0 -0
  191. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  192. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/markdown.md +0 -0
  193. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
  194. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/non-searchable.pdf +0 -0
  195. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/ocr-image.jpg +0 -0
  196. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  197. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  198. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  199. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  200. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/sample-contract.pdf +0 -0
  201. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/scanned.pdf +0 -0
  202. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/searchable.pdf +0 -0
  203. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/spanish-text.txt +0 -0
  204. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/test-article.pdf +0 -0
  205. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/toml/sample-config.toml +0 -0
  206. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  207. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/types_test.py +0 -0
  208. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/__init__.py +0 -0
  209. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/cache_test.py +0 -0
  210. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/device_test.py +0 -0
  211. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/errors_test.py +0 -0
  212. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/pdf_lock_test.py +0 -0
  213. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/serialization_test.py +0 -0
  214. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/string_test.py +0 -0
  215. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/sync_test.py +0 -0
  216. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/table_test.py +0 -0
  217. {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/tmp_test.py +0 -0
@@ -1,14 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.8.0
4
- Summary: A text extraction library supporting PDFs, images, office documents and more
3
+ Version: 3.8.1
4
+ Summary: Advanced document intelligence framework for extracting structured content from PDFs, images, and office documents
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
7
7
  License: MIT
8
8
  License-File: LICENSE
9
- Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
9
+ Keywords: automation,content-extraction,data-processing,document-analysis,document-intelligence,document-processing,entity-extraction,image-to-text,information-extraction,ocr,pdf-extraction,rag,structured-data,table-extraction,text-extraction
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Information Technology
13
+ Classifier: Intended Audience :: Science/Research
12
14
  Classifier: License :: OSI Approved :: MIT License
13
15
  Classifier: Operating System :: OS Independent
14
16
  Classifier: Programming Language :: Python :: 3 :: Only
@@ -16,10 +18,13 @@ Classifier: Programming Language :: Python :: 3.10
16
18
  Classifier: Programming Language :: Python :: 3.11
17
19
  Classifier: Programming Language :: Python :: 3.12
18
20
  Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Database
22
+ Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
23
+ Classifier: Topic :: Office/Business :: Office Suites
19
24
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
25
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
20
26
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
27
  Classifier: Topic :: Text Processing :: General
22
- Classifier: Topic :: Utilities
23
28
  Classifier: Typing :: Typed
24
29
  Requires-Python: >=3.10
25
30
  Requires-Dist: anyio>=4.9.0
@@ -83,49 +88,31 @@ Description-Content-Type: text/markdown
83
88
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
84
89
  [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
85
90
 
86
- **High-performance Open Source Document Intelligence framework for Python.** Built by engineers for production workloads - extract text from any document with excellent performance and minimal complexity.
91
+ **Advanced Document Intelligence for Modern Python Applications.** Transform PDFs, images, and office documents into structured data with production-grade performance. Built by engineers who understand that speed, reliability, and developer experience matter.
87
92
 
88
93
  📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
89
94
 
90
95
  ## Why Choose Kreuzberg?
91
96
 
92
- ### 🚀 Performance
97
+ ### Proven Performance
93
98
 
94
- - [benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 2-3x faster than the nearest alternatives
95
- - Minimal footprint: 71MB install vs 1GB+ for competitors
96
- - Lowest memory usage (~530MB average) optimized for production workloads
97
- - Edge and serverless ready - deploy anywhere without heavy dependencies
99
+ [Benchmarked](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) 6-126x faster than alternatives while using minimal resources. Process up to 14 files per second with 87MB install size and ~360MB memory usage. Optimized for production workloads and resource-constrained environments.
98
100
 
99
- ### 🛠️ Engineering Quality
101
+ ### 🏗️ Production Engineering
100
102
 
101
- - Built by software engineers with modern Python best practices
102
- - 95%+ test coverage with comprehensive test suite
103
- - Thoroughly benchmarked and profiled for real-world performance
104
- - Only framework offering true async/await support alongside sync APIs
105
- - Robust error handling and detailed logging
103
+ Comprehensive test coverage (95%+), robust error handling, and true async/await support. Built with modern Python practices for reliability in production environments.
106
104
 
107
- ### 🎯 Developer Experience
105
+ ### 🔧 Developer Experience
108
106
 
109
- - Works out of the box with sane defaults, scales with your needs
110
- - Native MCP server for AI tool integration (Claude Desktop, Cursor)
111
- - Full type safety with excellent IDE support (completions)
112
- - Comprehensive documentation including full API reference
107
+ Works immediately with smart defaults, scales as you grow. Native MCP integration for AI tools, full type safety, and clear documentation.
113
108
 
114
- ### 🌍 Deployment Options
109
+ ### 🚀 Flexible Deployment
115
110
 
116
- - Docker images for all architectures (AMD64, ARM64)
117
- - Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
118
- - CPU-only processing - no GPU requirements, lower energy consumption
119
- - 100% local processing - no external API dependencies
120
- - Multiple deployment modes: CLI, REST API, MCP server
111
+ Deploy on serverless platforms, containers, or traditional servers. Supports both CPU and GPU processing (via PaddleOCR and EasyOCR). No external API dependencies. Multiple deployment modes: CLI, REST API, MCP server.
121
112
 
122
- ### 🎯 Complete Solution
113
+ ### 📄 Comprehensive Format Support
123
114
 
124
- - Universal format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
125
- - Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
126
- - Advanced features: Table extraction, metadata extraction, content chunking for RAG
127
- - Production tools: REST API, CLI tools, batch processing, custom extractors
128
- - Fully extensible: Add your own extractors
115
+ Extract from PDFs, images, Office documents, HTML, spreadsheets, and presentations. Multiple OCR engines with intelligent fallbacks, table extraction, and content preparation for RAG workflows.
129
116
 
130
117
  ## Quick Start
131
118
 
@@ -161,7 +148,7 @@ import asyncio
161
148
  from kreuzberg import extract_file
162
149
 
163
150
  async def main():
164
- # Extract from any document type
151
+ # Extract content from files
165
152
  result = await extract_file("document.pdf")
166
153
  print(result.content)
167
154
  print(result.metadata)
@@ -275,23 +262,23 @@ kreuzberg extract *.pdf --output-dir ./extracted/
275
262
 
276
263
  ## 📊 Performance Comparison
277
264
 
278
- [Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across 94 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
265
+ [Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across ~100 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://goldziher.github.io/kreuzberg/performance-analysis/):
279
266
 
280
- | Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
281
- | ------------- | ----------- | ------ | ------------ | ------------ | ------------ |
282
- | **Kreuzberg** | 35+ files/s | 530MB | 71MB | 20 | High |
283
- | Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
284
- | MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
285
- | Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
267
+ | Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
268
+ | ------------- | ------------ | ------ | ------------ | ------------ | ------------ |
269
+ | **Kreuzberg** | 14.4 files/s | 360MB | 87MB | 43 | 100% |
270
+ | Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
271
+ | MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
272
+ | Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
286
273
 
287
274
  \*_Performance varies significantly with document complexity and size_
288
275
 
289
276
  **Key strengths:**
290
277
 
291
- - 2-3x faster processing than comparable frameworks
278
+ - 6-126x faster processing than comparable frameworks
292
279
  - Smallest installation footprint and memory usage
293
280
  - Only framework with built-in async/await support
294
- - CPU-only processing - no GPU dependencies
281
+ - Supports both CPU and GPU processing
295
282
  - Built by software engineers for production reliability
296
283
 
297
284
  > **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
@@ -302,6 +289,7 @@ kreuzberg extract *.pdf --output-dir ./extracted/
302
289
 
303
290
  - [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
304
291
  - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
292
+ - [Performance Analysis](https://goldziher.github.io/kreuzberg/performance-analysis/) - Detailed benchmark results
305
293
  - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
306
294
  - [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
307
295
  - [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
@@ -6,49 +6,31 @@
6
6
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
7
  [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
8
8
 
9
- **High-performance Open Source Document Intelligence framework for Python.** Built by engineers for production workloads - extract text from any document with excellent performance and minimal complexity.
9
+ **Advanced Document Intelligence for Modern Python Applications.** Transform PDFs, images, and office documents into structured data with production-grade performance. Built by engineers who understand that speed, reliability, and developer experience matter.
10
10
 
11
11
  📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
12
12
 
13
13
  ## Why Choose Kreuzberg?
14
14
 
15
- ### 🚀 Performance
15
+ ### Proven Performance
16
16
 
17
- - [benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 2-3x faster than the nearest alternatives
18
- - Minimal footprint: 71MB install vs 1GB+ for competitors
19
- - Lowest memory usage (~530MB average) optimized for production workloads
20
- - Edge and serverless ready - deploy anywhere without heavy dependencies
17
+ [Benchmarked](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) 6-126x faster than alternatives while using minimal resources. Process up to 14 files per second with 87MB install size and ~360MB memory usage. Optimized for production workloads and resource-constrained environments.
21
18
 
22
- ### 🛠️ Engineering Quality
19
+ ### 🏗️ Production Engineering
23
20
 
24
- - Built by software engineers with modern Python best practices
25
- - 95%+ test coverage with comprehensive test suite
26
- - Thoroughly benchmarked and profiled for real-world performance
27
- - Only framework offering true async/await support alongside sync APIs
28
- - Robust error handling and detailed logging
21
+ Comprehensive test coverage (95%+), robust error handling, and true async/await support. Built with modern Python practices for reliability in production environments.
29
22
 
30
- ### 🎯 Developer Experience
23
+ ### 🔧 Developer Experience
31
24
 
32
- - Works out of the box with sane defaults, scales with your needs
33
- - Native MCP server for AI tool integration (Claude Desktop, Cursor)
34
- - Full type safety with excellent IDE support (completions)
35
- - Comprehensive documentation including full API reference
25
+ Works immediately with smart defaults, scales as you grow. Native MCP integration for AI tools, full type safety, and clear documentation.
36
26
 
37
- ### 🌍 Deployment Options
27
+ ### 🚀 Flexible Deployment
38
28
 
39
- - Docker images for all architectures (AMD64, ARM64)
40
- - Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
41
- - CPU-only processing - no GPU requirements, lower energy consumption
42
- - 100% local processing - no external API dependencies
43
- - Multiple deployment modes: CLI, REST API, MCP server
29
+ Deploy on serverless platforms, containers, or traditional servers. Supports both CPU and GPU processing (via PaddleOCR and EasyOCR). No external API dependencies. Multiple deployment modes: CLI, REST API, MCP server.
44
30
 
45
- ### 🎯 Complete Solution
31
+ ### 📄 Comprehensive Format Support
46
32
 
47
- - Universal format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
48
- - Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
49
- - Advanced features: Table extraction, metadata extraction, content chunking for RAG
50
- - Production tools: REST API, CLI tools, batch processing, custom extractors
51
- - Fully extensible: Add your own extractors
33
+ Extract from PDFs, images, Office documents, HTML, spreadsheets, and presentations. Multiple OCR engines with intelligent fallbacks, table extraction, and content preparation for RAG workflows.
52
34
 
53
35
  ## Quick Start
54
36
 
@@ -84,7 +66,7 @@ import asyncio
84
66
  from kreuzberg import extract_file
85
67
 
86
68
  async def main():
87
- # Extract from any document type
69
+ # Extract content from files
88
70
  result = await extract_file("document.pdf")
89
71
  print(result.content)
90
72
  print(result.metadata)
@@ -198,23 +180,23 @@ kreuzberg extract *.pdf --output-dir ./extracted/
198
180
 
199
181
  ## 📊 Performance Comparison
200
182
 
201
- [Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across 94 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
183
+ [Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across ~100 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://goldziher.github.io/kreuzberg/performance-analysis/):
202
184
 
203
- | Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
204
- | ------------- | ----------- | ------ | ------------ | ------------ | ------------ |
205
- | **Kreuzberg** | 35+ files/s | 530MB | 71MB | 20 | High |
206
- | Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
207
- | MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
208
- | Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
185
+ | Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
186
+ | ------------- | ------------ | ------ | ------------ | ------------ | ------------ |
187
+ | **Kreuzberg** | 14.4 files/s | 360MB | 87MB | 43 | 100% |
188
+ | Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
189
+ | MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
190
+ | Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
209
191
 
210
192
  \*_Performance varies significantly with document complexity and size_
211
193
 
212
194
  **Key strengths:**
213
195
 
214
- - 2-3x faster processing than comparable frameworks
196
+ - 6-126x faster processing than comparable frameworks
215
197
  - Smallest installation footprint and memory usage
216
198
  - Only framework with built-in async/await support
217
- - CPU-only processing - no GPU dependencies
199
+ - Supports both CPU and GPU processing
218
200
  - Built by software engineers for production reliability
219
201
 
220
202
  > **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
@@ -225,6 +207,7 @@ kreuzberg extract *.pdf --output-dir ./extracted/
225
207
 
226
208
  - [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
227
209
  - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
210
+ - [Performance Analysis](https://goldziher.github.io/kreuzberg/performance-analysis/) - Detailed benchmark results
228
211
  - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
229
212
  - [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
230
213
  - [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
@@ -1,23 +1,19 @@
1
1
  # Kreuzberg
2
2
 
3
- Kreuzberg is a complete Open Source Document Intelligence framework. Its Built by engineers for production workloads -
4
- its not a data science / research orientated tool, but rather a pragmatic swiss-army knife that is meant to deliver.
5
- Yes, Python, when coupled with robust technologies such as `pdfium`, `tesseract` and `pandoc` can do quite a lot.
6
- Kreuzberg was also created (primarily) in Kreuzberg - the famous and beautiful neighborhood of Berlin.
3
+ Kreuzberg is an advanced open source document intelligence framework built for production workloads. Designed by engineers for reliability and performance, it transforms PDFs, images, and office documents into structured data with minimal complexity.
4
+
5
+ Built on proven technologies including PDFium, Tesseract, and Pandoc, Kreuzberg delivers enterprise-grade document processing capabilities while maintaining simplicity and speed.
7
6
 
8
7
  ## Why Kreuzberg?
9
8
 
10
- At the danger of over-selling, there are actually quite a lot of reasons why use Kreuzberg. You can read them below.
11
- BUT - this is not necessarily a mutually exclusive solution. For example.
12
- many text extraction pipelines can integrate a library such as Kreuzberg with some kind of heuristics on when to use it
13
- and when use something else.
9
+ Kreuzberg addresses real production needs with measurable benefits. It is not meant to be a mutually exclusive choice: it integrates well with existing pipelines and can be deployed alongside other tools based on specific requirements.
14
10
 
15
11
  ### 🚀 Performance
16
12
 
17
- - [benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 2-3x
18
- faster than the nearest alternatives
19
- - Minimal footprint: 71MB install vs 1GB+ for competitors
20
- - Lowest memory usage (~530MB average) optimized for production workloads
13
+ - [benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 6-126x
14
+ faster than competitors
15
+ - Minimal footprint: 87MB install vs 1GB+ for competitors
16
+ - Lowest memory usage (~360MB average) optimized for production workloads
21
17
  - Edge and serverless ready - deploy anywhere without heavy dependencies
22
18
 
23
19
  ### 🛠️ Engineering Quality
@@ -39,13 +35,13 @@ and when use something else.
39
35
 
40
36
  - Docker images for all architectures (AMD64, ARM64)
41
37
  - Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
42
- - CPU-only processing - no GPU requirements, lower energy consumption
43
- - 100% local processing - no external API dependencies
38
+ - Supports both CPU and GPU processing (PaddleOCR, EasyOCR)
39
+ - Local processing - no external API dependencies
44
40
  - Multiple deployment modes: CLI, REST API, MCP server
45
41
 
46
42
  ### 🎯 Complete Solution
47
43
 
48
- - Universal format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
44
+ - Comprehensive format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
49
45
  - Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
50
46
  - Advanced features: Table extraction, metadata extraction, content chunking for RAG
51
47
  - Production tools: REST API, CLI tools, batch processing, custom extractors
@@ -0,0 +1,140 @@
1
+ # Performance Analysis
2
+
3
+ ## Overview
4
+
5
+ This page presents comprehensive benchmark results comparing Kreuzberg against other text extraction frameworks. All data is derived from rigorous testing across ~100 real-world documents using standardized methodology.
6
+
7
+ > **Benchmark Methodology**: Results based on the [python-text-extraction-libraries-benchmarks-2025](https://github.com/Goldziher/python-text-extraction-libraries-benchmarks-2025) project with comprehensive testing across multiple document types and sizes.
8
+
9
+ ## Executive Summary
10
+
11
+ Kreuzberg demonstrates exceptional performance across all key metrics:
12
+
13
+ - **Speed**: 6-126x faster than competitors
14
+ - **Memory**: 2-4x lower usage
15
+ - **Installation**: 2-68x smaller footprint
16
+ - **Reliability**: Perfect 100% success rate
17
+
18
+ ## Detailed Performance Metrics
19
+
20
+ ### Processing Speed
21
+
22
+ #### By File Size Category
23
+
24
+ | Category | Kreuzberg Sync | Kreuzberg Async | Best Competitor | Advantage |
25
+ | --------------------- | -------------- | --------------- | --------------- | ----------- |
26
+ | **Tiny (\<100KB)** | 31.6 files/sec | 23.6 files/sec | 4.8 files/sec | 6.6x faster |
27
+ | **Small (100KB-1MB)** | 9.0 files/sec | 10.1 files/sec | 3.6 files/sec | 2.8x faster |
28
+ | **Medium (1-10MB)** | 2.6 files/sec | 3.2 files/sec | 0.065 files/sec | 49x faster |
29
+
30
+ #### Processing Time Comparison
31
+
32
+ | Framework | Tiny Files (s) | Small Files (s) | Medium Files (s) |
33
+ | ------------------- | -------------- | --------------- | ---------------- |
34
+ | **Kreuzberg Sync** | 0.032 | 0.111 | 0.388 |
35
+ | **Kreuzberg Async** | 0.042 | 0.099 | 0.315 |
36
+ | Extractous | 0.316 | 0.281 | 15.38 |
37
+ | Unstructured | 0.210 | 1.123 | - |
38
+ | Docling | 3.956 | 14.47 | - |
39
+
40
+ ### Memory Usage
41
+
42
+ | Framework | Average Memory (MB) | vs Kreuzberg |
43
+ | ------------------- | ------------------- | ------------ |
44
+ | **Kreuzberg Sync** | 360 | Baseline |
45
+ | **Kreuzberg Async** | 396 | +10% |
46
+ | Extractous | 513 | +43% |
47
+ | Unstructured | 1,389 | +286% |
48
+ | Docling | 1,838 | +411% |
49
+
50
+ ### Installation Size
51
+
52
+ | Framework | Size (MB) | Packages | vs Kreuzberg |
53
+ | ------------- | --------- | -------- | ------------ |
54
+ | **Kreuzberg** | 87 | 43 | Baseline |
55
+ | Unstructured | 176 | 54 | 2.0x larger |
56
+ | MarkItDown | 208 | 25 | 2.4x larger |
57
+ | Docling | 5,900 | 103 | 67.8x larger |
58
+
59
+ ### Success Rate & Reliability
60
+
61
+ | Framework | Tiny Files | Small Files | Medium Files | Overall |
62
+ | ------------- | ---------- | ----------- | ------------ | -------- |
63
+ | **Kreuzberg** | 100% | 100% | 100% | **100%** |
64
+ | Extractous | 100% | 95.8% | 100% | 98.6% |
65
+ | Unstructured | 100% | 100% | - | 100% |
66
+ | Docling | 100% | 96.3% | - | 98.2% |
67
+
68
+ ### Content Extraction Quality
69
+
70
+ #### Characters Extracted (Average)
71
+
72
+ | Framework | Tiny Files | Small Files | Medium Files |
73
+ | ------------- | ---------- | ----------- | ------------ |
74
+ | **Kreuzberg** | 6,950 | 173,505 | 500,643 |
75
+ | Extractous | 6,894 | 106,641 | 251,612 |
76
+ | Unstructured | 3,842 | 70,396 | - |
77
+ | Docling | 3,316 | 59,129 | - |
78
+
79
+ ## Performance Insights
80
+
81
+ ### Speed Advantages
82
+
83
+ 1. **Optimized Processing Pipeline**: Efficient async/await implementation
84
+ 1. **Smart Resource Management**: Minimal overhead operations
85
+ 1. **Native Libraries**: Built on high-performance C libraries (PDFium, Tesseract)
86
+
87
+ ### Memory Efficiency
88
+
89
+ 1. **Lean Architecture**: Minimal memory footprint during processing
90
+ 1. **Resource Cleanup**: Proper resource disposal and garbage collection
91
+ 1. **Streaming Processing**: Process large files without loading entirely into memory
92
+
93
+ ### Installation Benefits
94
+
95
+ 1. **Minimal Dependencies**: Only essential packages included
96
+ 1. **No Heavy ML Models**: CPU-focused processing without large model files
97
+ 1. **Efficient Packaging**: Optimized distribution with selective dependencies
98
+
99
+ ## Production Implications
100
+
101
+ ### Cost Savings
102
+
103
+ - **Infrastructure**: 2-4x lower memory requirements reduce server costs
104
+ - **Storage**: 2-68x smaller installation saves disk space
105
+ - **Processing**: 6-126x faster execution reduces compute time
106
+
107
+ ### Operational Benefits
108
+
109
+ - **Deployment Speed**: Faster installations and updates
110
+ - **Resource Planning**: Predictable memory and CPU usage
111
+ - **Scaling**: Efficient resource utilization enables higher throughput
112
+
113
+ ### Developer Experience
114
+
115
+ - **Quick Setup**: Minimal installation time and complexity
116
+ - **Reliable Performance**: Consistent results across document types
117
+ - **Production Ready**: Battle-tested performance characteristics
118
+
119
+ ## Test Environment
120
+
121
+ **Hardware**: Linux CI runners
122
+ **Python Version**: 3.13
123
+ **Document Corpus**: ~100 real-world documents tested across multiple frameworks
124
+ **Test Date**: July 13, 2025
125
+ **Methodology**: [Full methodology available](https://github.com/Goldziher/python-text-extraction-libraries-benchmarks-2025)
126
+
127
+ ## Framework Comparison Matrix
128
+
129
+ | Metric | Kreuzberg | Extractous | Unstructured | Docling |
130
+ | ------------------- | --------- | ---------- | ------------ | ------- |
131
+ | **Speed** | ★★★★★ | ★★☆☆☆ | ★★☆☆☆ | ★☆☆☆☆ |
132
+ | **Memory** | ★★★★★ | ★★★★☆ | ★★☆☆☆ | ★☆☆☆☆ |
133
+ | **Installation** | ★★★★★ | - | ★★★☆☆ | ★☆☆☆☆ |
134
+ | **Reliability** | ★★★★★ | ★★★★☆ | ★★★★★ | ★★★★☆ |
135
+ | **Content Quality** | ★★★★★ | ★★★☆☆ | ★★★☆☆ | ★★☆☆☆ |
136
+ | **Overall** | ★★★★★ | ★★★☆☆ | ★★★☆☆ | ★★☆☆☆ |
137
+
138
+ ______________________________________________________________________
139
+
140
+ *Performance data is based on comprehensive benchmarking across a real-world document corpus. Results may vary based on specific use cases and hardware configurations.*
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import os
3
4
  import re
4
5
  from dataclasses import dataclass
5
6
  from functools import lru_cache
@@ -181,8 +182,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
181
182
  import spacy
182
183
 
183
184
  if spacy_config.model_cache_dir:
184
- import os
185
-
186
185
  os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
187
186
 
188
187
  nlp = spacy.load(model_name)
@@ -3,10 +3,12 @@ from __future__ import annotations
3
3
  from abc import ABC, abstractmethod
4
4
  from typing import TYPE_CHECKING, ClassVar
5
5
 
6
+ from kreuzberg._types import ExtractionResult, normalize_metadata
7
+ from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
8
+
6
9
  if TYPE_CHECKING:
7
10
  from pathlib import Path
8
11
 
9
- from kreuzberg import ExtractionResult
10
12
  from kreuzberg._types import ExtractionConfig
11
13
 
12
14
 
@@ -104,8 +106,6 @@ class Extractor(ABC):
104
106
  if not self.config.enable_quality_processing:
105
107
  return result
106
108
 
107
- from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
108
-
109
109
  if not result.content:
110
110
  return result
111
111
 
@@ -120,8 +120,6 @@ class Extractor(ABC):
120
120
  enhanced_metadata["quality_score"] = quality_score
121
121
 
122
122
  # Return enhanced result
123
- from kreuzberg._types import ExtractionResult, normalize_metadata
124
-
125
123
  return ExtractionResult(
126
124
  content=cleaned_content,
127
125
  mime_type=result.mime_type,
@@ -11,13 +11,17 @@ from anyio import Path as AsyncPath
11
11
  from kreuzberg._extractors._base import Extractor
12
12
  from kreuzberg._mime_types import IMAGE_MIME_TYPES
13
13
  from kreuzberg._ocr import get_ocr_backend
14
- from kreuzberg._types import ExtractionResult
14
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
15
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
16
+ from kreuzberg._ocr._tesseract import TesseractConfig
15
17
  from kreuzberg._utils._tmp import create_temp_file
16
18
  from kreuzberg.exceptions import ValidationError
17
19
 
18
20
  if TYPE_CHECKING: # pragma: no cover
19
21
  from collections.abc import Mapping
20
22
 
23
+ from kreuzberg._types import ExtractionResult
24
+
21
25
 
22
26
  class ImageExtractor(Extractor):
23
27
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
@@ -78,44 +82,26 @@ class ImageExtractor(Extractor):
78
82
  if self.config.ocr_backend is None:
79
83
  raise ValidationError("ocr_backend is None, cannot perform OCR")
80
84
 
81
- if self.config.ocr_backend == "tesseract":
82
- from kreuzberg._ocr._sync import process_batch_images_sync
83
- from kreuzberg._ocr._tesseract import TesseractConfig
84
-
85
- if isinstance(self.config.ocr_config, TesseractConfig):
86
- config = self.config.ocr_config
87
- else:
88
- config = TesseractConfig()
89
-
90
- results = process_batch_images_sync([str(path)], config, backend="tesseract")
91
- if results:
92
- result = results[0]
93
- return self._apply_quality_processing(result)
94
- return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
95
-
96
- if self.config.ocr_backend == "paddleocr":
97
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
98
- from kreuzberg._ocr._sync import process_image_paddleocr_sync as paddle_process
85
+ backend = get_ocr_backend(self.config.ocr_backend)
99
86
 
87
+ if self.config.ocr_backend == "tesseract":
88
+ config = (
89
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
90
+ )
91
+ result = backend.process_file_sync(path, **config.__dict__)
92
+ elif self.config.ocr_backend == "paddleocr":
100
93
  paddle_config = (
101
94
  self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
102
95
  )
103
-
104
- result = paddle_process(path, paddle_config)
105
- return self._apply_quality_processing(result)
106
-
107
- if self.config.ocr_backend == "easyocr":
108
- from kreuzberg._ocr._easyocr import EasyOCRConfig
109
- from kreuzberg._ocr._sync import process_image_easyocr_sync as easy_process
110
-
96
+ result = backend.process_file_sync(path, **paddle_config.__dict__)
97
+ elif self.config.ocr_backend == "easyocr":
111
98
  easy_config = (
112
99
  self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
113
100
  )
114
-
115
- result = easy_process(path, easy_config)
116
- return self._apply_quality_processing(result)
117
-
118
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
101
+ result = backend.process_file_sync(path, **easy_config.__dict__)
102
+ else:
103
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
104
+ return self._apply_quality_processing(result)
119
105
 
120
106
  def _get_extension_from_mime_type(self, mime_type: str) -> str:
121
107
  if mime_type in self.IMAGE_MIME_TYPE_EXT_MAP:
@@ -1,8 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import contextlib
4
+ import os
4
5
  import re
6
+ import subprocess
5
7
  import sys
8
+ import tempfile
6
9
  from json import JSONDecodeError, loads
7
10
  from pathlib import Path
8
11
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
@@ -203,10 +206,6 @@ class PandocExtractor(Extractor):
203
206
  Returns:
204
207
  ExtractionResult with the extracted text and metadata.
205
208
  """
206
- import os
207
- import tempfile
208
- from pathlib import Path
209
-
210
209
  extension = self._get_pandoc_type_from_mime_type(self.mime_type)
211
210
  fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
212
211
 
@@ -579,8 +578,6 @@ class PandocExtractor(Extractor):
579
578
 
580
579
  def _validate_pandoc_version_sync(self) -> None:
581
580
  """Synchronous version of _validate_pandoc_version."""
582
- import subprocess
583
-
584
581
  try:
585
582
  if self._checked_version:
586
583
  return
@@ -625,10 +622,6 @@ class PandocExtractor(Extractor):
625
622
 
626
623
  def _extract_metadata_sync(self, path: Path) -> Metadata:
627
624
  """Synchronous version of _handle_extract_metadata."""
628
- import os
629
- import subprocess
630
- import tempfile
631
-
632
625
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
633
626
  fd, metadata_file = tempfile.mkstemp(suffix=".json")
634
627
  os.close(fd)
@@ -663,10 +656,6 @@ class PandocExtractor(Extractor):
663
656
 
664
657
  def _extract_file_sync(self, path: Path) -> str:
665
658
  """Synchronous version of _handle_extract_file."""
666
- import os
667
- import subprocess
668
- import tempfile
669
-
670
659
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
671
660
  fd, output_path = tempfile.mkstemp(suffix=".md")
672
661
  os.close(fd)