kreuzberg 3.6.2__tar.gz → 3.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.gitignore +2 -0
  2. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/PKG-INFO +116 -48
  3. kreuzberg-3.8.0/README.md +236 -0
  4. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/pyproject.toml +4 -1
  5. kreuzberg-3.8.0/benchmarks/results/latest.json +607 -0
  6. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +36 -0
  7. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +145 -3
  8. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/models.py +60 -0
  9. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +127 -3
  10. kreuzberg-3.8.0/docs/index.md +58 -0
  11. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/user-guide/docker.md +1 -1
  12. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/user-guide/index.md +1 -0
  13. kreuzberg-3.8.0/docs/user-guide/mcp-server.md +586 -0
  14. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_extractors/_base.py +40 -0
  15. kreuzberg-3.8.0/kreuzberg/_extractors/_email.py +149 -0
  16. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_extractors/_html.py +15 -3
  17. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_extractors/_image.py +17 -18
  18. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_extractors/_pdf.py +68 -14
  19. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_extractors/_presentation.py +62 -10
  20. kreuzberg-3.8.0/kreuzberg/_extractors/_spread_sheet.py +358 -0
  21. kreuzberg-3.8.0/kreuzberg/_extractors/_structured.py +148 -0
  22. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_gmft.py +2 -2
  23. kreuzberg-3.8.0/kreuzberg/_mcp/__init__.py +5 -0
  24. kreuzberg-3.8.0/kreuzberg/_mcp/server.py +227 -0
  25. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_mime_types.py +27 -1
  26. kreuzberg-3.8.0/kreuzberg/_multiprocessing/__init__.py +5 -0
  27. kreuzberg-3.8.0/kreuzberg/_ocr/__init__.py +47 -0
  28. kreuzberg-3.6.2/kreuzberg/_multiprocessing/tesseract_pool.py → kreuzberg-3.8.0/kreuzberg/_ocr/_pool.py +3 -5
  29. kreuzberg-3.8.0/kreuzberg/_ocr/_sync.py +566 -0
  30. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_ocr/_tesseract.py +6 -2
  31. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_registry.py +4 -0
  32. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_types.py +131 -0
  33. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_utils/_cache.py +17 -2
  34. kreuzberg-3.6.2/kreuzberg/_multiprocessing/process_manager.py → kreuzberg-3.8.0/kreuzberg/_utils/_process_pool.py +90 -2
  35. kreuzberg-3.8.0/kreuzberg/_utils/_quality.py +237 -0
  36. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_utils/_serialization.py +4 -2
  37. kreuzberg-3.8.0/kreuzberg/_utils/_string.py +182 -0
  38. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_utils/_sync.py +5 -2
  39. kreuzberg-3.8.0/kreuzberg/_utils/_table.py +261 -0
  40. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/mkdocs.yaml +1 -0
  41. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/pyproject.toml +27 -4
  42. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/api/main_test.py +162 -2
  43. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/extraction_test.py +12 -3
  44. kreuzberg-3.8.0/tests/extractors/email_comprehensive_test.py +326 -0
  45. kreuzberg-3.8.0/tests/extractors/email_test.py +31 -0
  46. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/extractors/image_test.py +22 -10
  47. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/extractors/pdf_test.py +2 -2
  48. kreuzberg-3.8.0/tests/extractors/structured_test.py +90 -0
  49. kreuzberg-3.8.0/tests/mcp_server_test.py +374 -0
  50. kreuzberg-3.8.0/tests/multiprocessing/gmft_isolated_test.py +488 -0
  51. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/multiprocessing/process_manager_test.py +1 -1
  52. kreuzberg-3.8.0/tests/multiprocessing/sync_easyocr_test.py +640 -0
  53. kreuzberg-3.8.0/tests/multiprocessing/sync_paddleocr_test.py +529 -0
  54. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/multiprocessing/sync_tesseract_test.py +29 -33
  55. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/multiprocessing/tesseract_pool_test.py +2 -2
  56. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/ocr/paddleocr_test.py +4 -4
  57. kreuzberg-3.8.0/tests/test_source_files/better-ocr-image.jpg +0 -0
  58. kreuzberg-3.8.0/tests/test_source_files/email/sample-email.eml +11 -0
  59. kreuzberg-3.8.0/tests/test_source_files/json/sample-document.json +1 -0
  60. kreuzberg-3.8.0/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  61. kreuzberg-3.8.0/tests/test_source_files/toml/sample-config.toml +33 -0
  62. kreuzberg-3.8.0/tests/test_source_files/yaml/sample-config.yaml +15 -0
  63. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/utils/string_test.py +3 -3
  64. kreuzberg-3.8.0/tests/utils/table_test.py +413 -0
  65. kreuzberg-3.8.0/uv.lock +6637 -0
  66. kreuzberg-3.6.2/.gitmodules +0 -3
  67. kreuzberg-3.6.2/README.md +0 -173
  68. kreuzberg-3.6.2/docs/index.md +0 -15
  69. kreuzberg-3.6.2/kreuzberg/_extractors/_spread_sheet.py +0 -183
  70. kreuzberg-3.6.2/kreuzberg/_multiprocessing/__init__.py +0 -6
  71. kreuzberg-3.6.2/kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  72. kreuzberg-3.6.2/kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  73. kreuzberg-3.6.2/kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  74. kreuzberg-3.6.2/kreuzberg/_ocr/__init__.py +0 -17
  75. kreuzberg-3.6.2/kreuzberg/_utils/_process_pool.py +0 -100
  76. kreuzberg-3.6.2/kreuzberg/_utils/_string.py +0 -39
  77. kreuzberg-3.6.2/uv.lock +0 -4200
  78. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.commitlintrc +0 -0
  79. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.docker/Dockerfile +0 -0
  80. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.docker/README.md +0 -0
  81. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.dockerignore +0 -0
  82. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.github/dependabot.yaml +0 -0
  83. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.github/workflows/ci.yaml +0 -0
  84. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.github/workflows/docs.yml +0 -0
  85. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.github/workflows/pr-title.yaml +0 -0
  86. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.github/workflows/publish-docker.yml +0 -0
  87. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.github/workflows/release.yaml +0 -0
  88. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.markdownlint.yaml +0 -0
  89. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/.pre-commit-config.yaml +0 -0
  90. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/LICENSE +0 -0
  91. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/ai-rulez.yaml +0 -0
  92. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/README.md +0 -0
  93. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/benchmark_baseline.py +0 -0
  94. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/end_to_end_benchmark.py +0 -0
  95. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/final_benchmark.py +0 -0
  96. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/results/baseline_results.json +0 -0
  97. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  98. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
  99. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/results/final_benchmark_results.json +0 -0
  100. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/results/mime_caching_results.json +0 -0
  101. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/results/msgspec_caching_results.json +0 -0
  102. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/results/ocr_caching_results.json +0 -0
  103. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
  104. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
  105. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/results/table_caching_results.json +0 -0
  106. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/serialization_benchmark.py +0 -0
  107. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  108. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  109. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  110. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/benchmarks/statistical_benchmark.py +0 -0
  111. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/advanced/custom-extractors.md +0 -0
  112. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/advanced/custom-hooks.md +0 -0
  113. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/advanced/error-handling.md +0 -0
  114. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/advanced/index.md +0 -0
  115. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/advanced/performance.md +0 -0
  116. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/api-reference/exceptions.md +0 -0
  117. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/api-reference/extraction-functions.md +0 -0
  118. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/api-reference/extractor-registry.md +0 -0
  119. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/api-reference/index.md +0 -0
  120. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/api-reference/ocr-configuration.md +0 -0
  121. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/api-reference/types.md +0 -0
  122. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/assets/favicon.png +0 -0
  123. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/assets/logo.png +0 -0
  124. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/changelog.md +0 -0
  125. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/cli.md +0 -0
  126. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/contributing.md +0 -0
  127. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/css/extra.css +0 -0
  128. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/examples/extraction-examples.md +0 -0
  129. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/examples/index.md +0 -0
  130. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/getting-started/index.md +0 -0
  131. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/getting-started/installation.md +0 -0
  132. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/getting-started/quick-start.md +0 -0
  133. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/user-guide/api-server.md +0 -0
  134. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/user-guide/basic-usage.md +0 -0
  135. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/user-guide/chunking.md +0 -0
  136. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/user-guide/extraction-configuration.md +0 -0
  137. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/user-guide/metadata-extraction.md +0 -0
  138. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/user-guide/ocr-backends.md +0 -0
  139. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/user-guide/ocr-configuration.md +0 -0
  140. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/docs/user-guide/supported-formats.md +0 -0
  141. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/__init__.py +0 -0
  142. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/__main__.py +0 -0
  143. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_api/__init__.py +0 -0
  144. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_api/main.py +0 -0
  145. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_chunker.py +0 -0
  146. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_cli_config.py +0 -0
  147. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_constants.py +0 -0
  148. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_entity_extraction.py +0 -0
  149. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_extractors/__init__.py +0 -0
  150. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_extractors/_pandoc.py +0 -0
  151. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_language_detection.py +0 -0
  152. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
  153. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_ocr/_base.py +0 -0
  154. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_ocr/_easyocr.py +0 -0
  155. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
  156. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_playa.py +0 -0
  157. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_utils/__init__.py +0 -0
  158. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_utils/_device.py +0 -0
  159. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_utils/_document_cache.py +0 -0
  160. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_utils/_errors.py +0 -0
  161. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  162. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/_utils/_tmp.py +0 -0
  163. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/cli.py +0 -0
  164. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/exceptions.py +0 -0
  165. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/extraction.py +0 -0
  166. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/kreuzberg/py.typed +0 -0
  167. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/__init__.py +0 -0
  168. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/api/__init__.py +0 -0
  169. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/chunker_test.py +0 -0
  170. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/cli_integration_test.py +0 -0
  171. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/cli_test.py +0 -0
  172. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/conftest.py +0 -0
  173. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/entity_extraction_test.py +0 -0
  174. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/exceptions_test.py +0 -0
  175. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/extraction_batch_test.py +0 -0
  176. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/extractors/__init__.py +0 -0
  177. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/extractors/html_test.py +0 -0
  178. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  179. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/extractors/pandoc_test.py +0 -0
  180. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/extractors/presentation_test.py +0 -0
  181. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/extractors/spreed_sheet_test.py +0 -0
  182. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/gmft_extended_test.py +0 -0
  183. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/gmft_test.py +0 -0
  184. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/hooks_test.py +0 -0
  185. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/language_detection_test.py +0 -0
  186. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/mime_types_test.py +0 -0
  187. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/multiprocessing/__init__.py +0 -0
  188. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
  189. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/ocr/__init__.py +0 -0
  190. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/ocr/base_test.py +0 -0
  191. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/ocr/device_integration_test.py +0 -0
  192. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/ocr/easyocr_test.py +0 -0
  193. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/ocr/init_test.py +0 -0
  194. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/ocr/tesseract_test.py +0 -0
  195. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/playa_test.py +0 -0
  196. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/registry_test.py +0 -0
  197. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/document.docx +0 -0
  198. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  199. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/excel.xlsx +0 -0
  200. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/french-text.txt +0 -0
  201. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/german-text.txt +0 -0
  202. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/html.html +0 -0
  203. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/markdown.md +0 -0
  204. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  205. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/non-searchable.pdf +0 -0
  206. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/ocr-image.jpg +0 -0
  207. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  208. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  209. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  210. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  211. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/sample-contract.pdf +0 -0
  212. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/scanned.pdf +0 -0
  213. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/searchable.pdf +0 -0
  214. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/spanish-text.txt +0 -0
  215. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/test_source_files/test-article.pdf +0 -0
  216. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/types_test.py +0 -0
  217. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/utils/__init__.py +0 -0
  218. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/utils/cache_test.py +0 -0
  219. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/utils/device_test.py +0 -0
  220. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/utils/errors_test.py +0 -0
  221. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/utils/pdf_lock_test.py +0 -0
  222. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/utils/process_pool_test.py +0 -0
  223. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/utils/serialization_test.py +0 -0
  224. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/utils/sync_test.py +0 -0
  225. {kreuzberg-3.6.2 → kreuzberg-3.8.0}/tests/utils/tmp_test.py +0 -0
@@ -33,3 +33,5 @@ GEMINI.md
33
33
  prompt_template.egg-info/
34
34
  requirements.txt
35
35
  site/
36
+ .cache/
37
+ dist/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.6.2
3
+ Version: 3.8.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
@@ -23,9 +23,10 @@ Classifier: Topic :: Utilities
23
23
  Classifier: Typing :: Typed
24
24
  Requires-Python: >=3.10
25
25
  Requires-Dist: anyio>=4.9.0
26
- Requires-Dist: charset-normalizer>=3.4.2
26
+ Requires-Dist: chardetng-py>=0.3.4
27
27
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
28
- Requires-Dist: html-to-markdown[lxml]>=1.6.0
28
+ Requires-Dist: html-to-markdown[lxml]>=1.8.0
29
+ Requires-Dist: mcp>=1.11.0
29
30
  Requires-Dist: msgspec>=0.18.0
30
31
  Requires-Dist: playa-pdf>=0.6.1
31
32
  Requires-Dist: psutil>=7.0.0
@@ -33,6 +34,9 @@ Requires-Dist: pypdfium2==4.30.0
33
34
  Requires-Dist: python-calamine>=0.3.2
34
35
  Requires-Dist: python-pptx>=1.0.2
35
36
  Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
37
+ Provides-Extra: additional-extensions
38
+ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
39
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
36
40
  Provides-Extra: all
37
41
  Requires-Dist: click>=8.2.1; extra == 'all'
38
42
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
@@ -40,6 +44,7 @@ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
40
44
  Requires-Dist: gmft>=0.4.2; extra == 'all'
41
45
  Requires-Dist: keybert>=0.9.0; extra == 'all'
42
46
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
47
+ Requires-Dist: mailparse>=1.0.15; extra == 'all'
43
48
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
44
49
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
45
50
  Requires-Dist: rich>=14.0.0; extra == 'all'
@@ -76,21 +81,51 @@ Description-Content-Type: text/markdown
76
81
  [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
77
82
  [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
78
83
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
84
+ [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
79
85
 
80
- **High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
86
+ **High-performance Open Source Document Intelligence framework for Python.** Built by engineers for production workloads - extract text from any document with excellent performance and minimal complexity.
81
87
 
82
88
  📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
83
89
 
84
- ## Why Kreuzberg?
90
+ ## Why Choose Kreuzberg?
85
91
 
86
- - **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
87
- - **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+) with lowest memory usage (~530MB)
88
- - **⚡ Dual APIs**: Only library with both sync and async support
89
- - **🔧 Zero Configuration**: Works out of the box with sane defaults
90
- - **🏠 Local Processing**: No cloud dependencies or external API calls
91
- - **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
92
- - **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
93
- - **🐳 Production Ready**: CLI, REST API, and Docker images included
92
+ ### 🚀 Performance
93
+
94
+ - [Benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 2-3x faster than the nearest alternatives
95
+ - Minimal footprint: 71MB install vs 1GB+ for competitors
96
+ - Lowest memory usage (~530MB average) optimized for production workloads
97
+ - Edge and serverless ready - deploy anywhere without heavy dependencies
98
+
99
+ ### 🛠️ Engineering Quality
100
+
101
+ - Built by software engineers with modern Python best practices
102
+ - 95%+ test coverage with comprehensive test suite
103
+ - Thoroughly benchmarked and profiled for real-world performance
104
+ - Only framework offering true async/await support alongside sync APIs
105
+ - Robust error handling and detailed logging
106
+
107
+ ### 🎯 Developer Experience
108
+
109
+ - Works out of the box with sane defaults, scales with your needs
110
+ - Native MCP server for AI tool integration (Claude Desktop, Cursor)
111
+ - Full type safety with excellent IDE support (completions)
112
+ - Comprehensive documentation including full API reference
113
+
114
+ ### 🌍 Deployment Options
115
+
116
+ - Docker images for all architectures (AMD64, ARM64)
117
+ - Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
118
+ - CPU-only processing - no GPU requirements, lower energy consumption
119
+ - 100% local processing - no external API dependencies
120
+ - Multiple deployment modes: CLI, REST API, MCP server
121
+
122
+ ### 🎯 Complete Solution
123
+
124
+ - Universal format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
125
+ - Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
126
+ - Advanced features: Table extraction, metadata extraction, content chunking for RAG
127
+ - Production tools: REST API, CLI tools, batch processing, custom extractors
128
+ - Fully extensible: Add your own extractors
94
129
 
95
130
  ## Quick Start
96
131
 
@@ -136,6 +171,55 @@ asyncio.run(main())
136
171
 
137
172
  ## Deployment Options
138
173
 
174
+ ### 🤖 MCP Server (AI Integration)
175
+
176
+ **Connect directly to Claude Desktop, Cursor, and other AI tools with the Model Context Protocol:**
177
+
178
+ ```bash
179
+ # Install and run MCP server with all features (recommended)
180
+ pip install "kreuzberg[all]"
181
+ kreuzberg-mcp
182
+
183
+ # Or with uvx (recommended for Claude Desktop)
184
+ uvx --with "kreuzberg[all]" kreuzberg-mcp
185
+
186
+ # Basic installation (core features only)
187
+ pip install kreuzberg
188
+ kreuzberg-mcp
189
+ ```
190
+
191
+ **Configure in Claude Desktop (`claude_desktop_config.json`):**
192
+
193
+ ```json
194
+ {
195
+ "mcpServers": {
196
+ "kreuzberg": {
197
+ "command": "uvx",
198
+ "args": ["--with", "kreuzberg[all]", "kreuzberg-mcp"]
199
+ }
200
+ }
201
+ }
202
+ ```
203
+
204
+ **Basic configuration (core features only):**
205
+
206
+ ```json
207
+ {
208
+ "mcpServers": {
209
+ "kreuzberg": {
210
+ "command": "uvx",
211
+ "args": ["kreuzberg-mcp"]
212
+ }
213
+ }
214
+ }
215
+ ```
216
+
217
+ **Available MCP capabilities:**
218
+
219
+ - **Tools**: `extract_document`, `extract_bytes`, `extract_simple`
220
+ - **Resources**: Configuration, supported formats, OCR backends
221
+ - **Prompts**: Extract-and-summarize, structured analysis workflows
222
+
139
223
  ### 🐳 Docker (Recommended)
140
224
 
141
225
  ```bash
@@ -146,7 +230,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
146
230
  curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
147
231
  ```
148
232
 
149
- Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
233
+ Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
150
234
 
151
235
  ### 🌐 REST API
152
236
 
@@ -189,23 +273,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
189
273
  | **Web** | HTML, XML, MHTML |
190
274
  | **Archives** | Support via extraction |
191
275
 
192
- ## Performance
276
+ ## 📊 Performance Comparison
277
+
278
+ [Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across 94 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
193
279
 
194
- **[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
280
+ | Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
281
+ | ------------- | ----------- | ------ | ------------ | ------------ | ------------ |
282
+ | **Kreuzberg** | 35+ files/s | 530MB | 71MB | 20 | High |
283
+ | Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
284
+ | MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
285
+ | Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
195
286
 
196
- | Library | Speed | Memory | Install Size | Dependencies | Success Rate |
197
- | ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
198
- | **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
199
- | Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
200
- | MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
201
- | Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
287
+ \*_Performance varies significantly with document complexity and size_
202
288
 
203
- \*_Can achieve 75% reliability with 15% performance trade-off when configured_
204
- †_Good on simple documents, struggles with large/complex files (>10MB)_
205
- ‡_Frequently fails/times out on medium files (>1MB)_
289
+ **Key strengths:**
206
290
 
207
- > **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
208
- > **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
291
+ - 2-3x faster processing than comparable frameworks
292
+ - Smallest installation footprint and memory usage
293
+ - Only framework with built-in async/await support
294
+ - CPU-only processing - no GPU dependencies
295
+ - Built by software engineers for production reliability
296
+
297
+ > **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
209
298
 
210
299
  ## Documentation
211
300
 
@@ -219,27 +308,6 @@ kreuzberg extract *.pdf --output-dir ./extracted/
219
308
  - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
220
309
  - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
221
310
 
222
- ## Advanced Features
223
-
224
- - **📊 Table Extraction**: Extract tables from PDFs with GMFT
225
- - **🧩 Content Chunking**: Split documents for RAG applications
226
- - **🎯 Custom Extractors**: Extend with your own document handlers
227
- - **🔧 Configuration**: Flexible TOML-based configuration
228
- - **🪝 Hooks**: Pre/post-processing customization
229
- - **🌍 Multi-language OCR**: 100+ languages supported
230
- - **⚙️ Metadata Extraction**: Rich document metadata
231
- - **🔄 Batch Processing**: Efficient bulk document processing
232
-
233
311
  ## License
234
312
 
235
313
  MIT License - see [LICENSE](LICENSE) for details.
236
-
237
- ______________________________________________________________________
238
-
239
- <div align="center">
240
-
241
- **[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
242
-
243
- Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
244
-
245
- </div>
@@ -0,0 +1,236 @@
1
+ # Kreuzberg
2
+
3
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
4
+ [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
5
+ [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+ [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
8
+
9
+ **High-performance Open Source Document Intelligence framework for Python.** Built by engineers for production workloads - extract text from any document with excellent performance and minimal complexity.
10
+
11
+ 📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
12
+
13
+ ## Why Choose Kreuzberg?
14
+
15
+ ### 🚀 Performance
16
+
17
+ - [Benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 2-3x faster than the nearest alternatives
18
+ - Minimal footprint: 71MB install vs 1GB+ for competitors
19
+ - Lowest memory usage (~530MB average), optimized for production workloads
20
+ - Edge and serverless ready - deploy anywhere without heavy dependencies
21
+
22
+ ### 🛠️ Engineering Quality
23
+
24
+ - Built by software engineers with modern Python best practices
25
+ - 95%+ test coverage with a comprehensive test suite
26
+ - Thoroughly benchmarked and profiled for real-world performance
27
+ - Only framework offering true async/await support alongside sync APIs
28
+ - Robust error handling and detailed logging
29
+
30
+ ### 🎯 Developer Experience
31
+
32
+ - Works out of the box with sane defaults, scales with your needs
33
+ - Native MCP server for AI tool integration (Claude Desktop, Cursor)
34
+ - Full type safety with excellent IDE support (completions)
35
+ - Comprehensive documentation including full API reference
36
+
37
+ ### 🌍 Deployment Options
38
+
39
+ - Docker images for all architectures (AMD64, ARM64)
40
+ - Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
41
+ - CPU-only processing - no GPU requirements, lower energy consumption
42
+ - 100% local processing - no external API dependencies
43
+ - Multiple deployment modes: CLI, REST API, MCP server
44
+
45
+ ### 🎯 Complete Solution
46
+
47
+ - Universal format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
48
+ - Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
49
+ - Advanced features: Table extraction, metadata extraction, content chunking for RAG
50
+ - Production tools: REST API, CLI tools, batch processing, custom extractors
51
+ - Fully extensible: Add your own extractors
52
+
53
+ ## Quick Start
54
+
55
+ ### Installation
56
+
57
+ ```bash
58
+ # Basic installation
59
+ pip install kreuzberg
60
+
61
+ # With optional features
62
+ pip install "kreuzberg[cli,api]" # CLI + REST API
63
+ pip install "kreuzberg[easyocr,gmft]" # EasyOCR + table extraction
64
+ pip install "kreuzberg[all]" # Everything
65
+ ```
66
+
67
+ ### System Dependencies
68
+
69
+ ```bash
70
+ # Ubuntu/Debian
71
+ sudo apt-get install tesseract-ocr pandoc
72
+
73
+ # macOS
74
+ brew install tesseract pandoc
75
+
76
+ # Windows
77
+ choco install tesseract pandoc
78
+ ```
79
+
80
+ ### Basic Usage
81
+
82
+ ```python
83
+ import asyncio
84
+ from kreuzberg import extract_file
85
+
86
+ async def main():
87
+     # Extract from any document type
88
+     result = await extract_file("document.pdf")
89
+     print(result.content)
90
+     print(result.metadata)
91
+
92
+ asyncio.run(main())
93
+ ```
94
+
95
+ ## Deployment Options
96
+
97
+ ### 🤖 MCP Server (AI Integration)
98
+
99
+ **Connect directly to Claude Desktop, Cursor, and other AI tools with the Model Context Protocol:**
100
+
101
+ ```bash
102
+ # Install and run MCP server with all features (recommended)
103
+ pip install "kreuzberg[all]"
104
+ kreuzberg-mcp
105
+
106
+ # Or with uvx (recommended for Claude Desktop)
107
+ uvx --with "kreuzberg[all]" kreuzberg-mcp
108
+
109
+ # Basic installation (core features only)
110
+ pip install kreuzberg
111
+ kreuzberg-mcp
112
+ ```
113
+
114
+ **Configure in Claude Desktop (`claude_desktop_config.json`):**
115
+
116
+ ```json
117
+ {
118
+ "mcpServers": {
119
+ "kreuzberg": {
120
+ "command": "uvx",
121
+ "args": ["--with", "kreuzberg[all]", "kreuzberg-mcp"]
122
+ }
123
+ }
124
+ }
125
+ ```
126
+
127
+ **Basic configuration (core features only):**
128
+
129
+ ```json
130
+ {
131
+ "mcpServers": {
132
+ "kreuzberg": {
133
+ "command": "uvx",
134
+ "args": ["kreuzberg-mcp"]
135
+ }
136
+ }
137
+ }
138
+ ```
139
+
140
+ **Available MCP capabilities:**
141
+
142
+ - **Tools**: `extract_document`, `extract_bytes`, `extract_simple`
143
+ - **Resources**: Configuration, supported formats, OCR backends
144
+ - **Prompts**: Extract-and-summarize, structured analysis workflows
145
+
146
+ ### 🐳 Docker (Recommended)
147
+
148
+ ```bash
149
+ # Run API server
150
+ docker run -p 8000:8000 goldziher/kreuzberg:latest
151
+
152
+ # Extract files
153
+ curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
154
+ ```
155
+
156
+ Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
157
+
158
+ ### 🌐 REST API
159
+
160
+ ```bash
161
+ # Install and run
162
+ pip install "kreuzberg[api]"
163
+ litestar --app kreuzberg._api.main:app run
164
+
165
+ # Health check
166
+ curl http://localhost:8000/health
167
+
168
+ # Extract files
169
+ curl -X POST http://localhost:8000/extract -F "data=@file.pdf"
170
+ ```
171
+
172
+ ### 💻 Command Line
173
+
174
+ ```bash
175
+ # Install CLI
176
+ pip install "kreuzberg[cli]"
177
+
178
+ # Extract to stdout
179
+ kreuzberg extract document.pdf
180
+
181
+ # JSON output with metadata
182
+ kreuzberg extract document.pdf --output-format json --show-metadata
183
+
184
+ # Batch processing
185
+ kreuzberg extract *.pdf --output-dir ./extracted/
186
+ ```
187
+
188
+ ## Supported Formats
189
+
190
+ | Category | Formats |
191
+ | ----------------- | ------------------------------ |
192
+ | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
193
+ | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
194
+ | **Spreadsheets** | XLSX, XLS, CSV, ODS |
195
+ | **Presentations** | PPTX, PPT, ODP |
196
+ | **Web** | HTML, XML, MHTML |
197
+ | **Archives** | Support via extraction |
198
+
199
+ ## 📊 Performance Comparison
200
+
201
+ [Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across 94 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
202
+
203
+ | Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
204
+ | ------------- | ----------- | ------ | ------------ | ------------ | ------------ |
205
+ | **Kreuzberg** | 35+ files/s | 530MB | 71MB | 20 | High |
206
+ | Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
207
+ | MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
208
+ | Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
209
+
210
+ \*_Performance varies significantly with document complexity and size_
211
+
212
+ **Key strengths:**
213
+
214
+ - 2-3x faster processing than comparable frameworks
215
+ - Smallest installation footprint and memory usage
216
+ - Only framework with built-in async/await support
217
+ - CPU-only processing - no GPU dependencies
218
+ - Built by software engineers for production reliability
219
+
220
+ > **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
221
+
222
+ ## Documentation
223
+
224
+ ### Quick Links
225
+
226
+ - [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
227
+ - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
228
+ - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
229
+ - [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
230
+ - [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
231
+ - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
232
+ - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
233
+
234
+ ## License
235
+
236
+ MIT License - see [LICENSE](LICENSE) for details.
@@ -3,9 +3,12 @@ name = "kreuzberg-benchmarks"
3
3
  version = "0.1.0"
4
4
  description = "Performance benchmarking suite for Kreuzberg text extraction library"
5
5
  readme = "README.md"
6
- requires-python = ">=3.13"
6
+ requires-python = ">=3.10"
7
7
  classifiers = [
8
8
  "Programming Language :: Python :: 3 :: Only",
9
+ "Programming Language :: Python :: 3.10",
10
+ "Programming Language :: Python :: 3.11",
11
+ "Programming Language :: Python :: 3.12",
9
12
  "Programming Language :: Python :: 3.13",
10
13
  ]
11
14
  dependencies = [