kreuzberg 3.7.0__tar.gz → 3.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (222)
  1. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.gitignore +2 -0
  2. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/PKG-INFO +58 -54
  3. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/README.md +43 -48
  4. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/pyproject.toml +4 -1
  5. kreuzberg-3.8.1/benchmarks/results/latest.json +607 -0
  6. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +36 -0
  7. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +145 -3
  8. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/models.py +60 -0
  9. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +127 -3
  10. kreuzberg-3.8.1/docs/index.md +54 -0
  11. kreuzberg-3.8.1/docs/performance-analysis.md +140 -0
  12. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/docker.md +1 -1
  13. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/mcp-server.md +15 -0
  14. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_entity_extraction.py +1 -2
  15. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_base.py +39 -1
  16. kreuzberg-3.8.1/kreuzberg/_extractors/_email.py +149 -0
  17. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_html.py +15 -3
  18. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_image.py +21 -36
  19. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_pandoc.py +3 -14
  20. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_pdf.py +81 -48
  21. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_presentation.py +62 -10
  22. kreuzberg-3.8.1/kreuzberg/_extractors/_spread_sheet.py +358 -0
  23. kreuzberg-3.8.1/kreuzberg/_extractors/_structured.py +148 -0
  24. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_gmft.py +314 -7
  25. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_mime_types.py +27 -1
  26. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_ocr/__init__.py +10 -1
  27. kreuzberg-3.8.1/kreuzberg/_ocr/_base.py +113 -0
  28. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_ocr/_easyocr.py +91 -0
  29. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_ocr/_paddleocr.py +89 -0
  30. kreuzberg-3.8.1/kreuzberg/_ocr/_tesseract.py +996 -0
  31. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_registry.py +4 -0
  32. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_types.py +131 -0
  33. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_cache.py +52 -4
  34. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_errors.py +3 -7
  35. kreuzberg-3.7.0/kreuzberg/_multiprocessing/process_manager.py → kreuzberg-3.8.1/kreuzberg/_utils/_process_pool.py +86 -2
  36. kreuzberg-3.8.1/kreuzberg/_utils/_quality.py +237 -0
  37. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_serialization.py +4 -2
  38. kreuzberg-3.8.1/kreuzberg/_utils/_string.py +182 -0
  39. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_sync.py +5 -2
  40. kreuzberg-3.8.1/kreuzberg/_utils/_table.py +261 -0
  41. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/cli.py +1 -2
  42. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/extraction.py +4 -22
  43. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/mkdocs.yaml +1 -0
  44. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/pyproject.toml +41 -15
  45. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/api/main_test.py +162 -2
  46. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extraction_batch_test.py +4 -4
  47. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extraction_test.py +12 -3
  48. kreuzberg-3.8.1/tests/extractors/email_comprehensive_test.py +326 -0
  49. kreuzberg-3.8.1/tests/extractors/email_test.py +31 -0
  50. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/image_test.py +64 -69
  51. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/pdf_test.py +2 -2
  52. kreuzberg-3.8.1/tests/extractors/structured_test.py +90 -0
  53. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/gmft_test.py +15 -2
  54. kreuzberg-3.8.1/tests/multiprocessing/gmft_isolated_test.py +489 -0
  55. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/multiprocessing/process_manager_test.py +1 -1
  56. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/multiprocessing/tesseract_pool_test.py +4 -4
  57. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/base_test.py +14 -0
  58. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/easyocr_test.py +36 -0
  59. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/paddleocr_test.py +54 -4
  60. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/tesseract_test.py +44 -0
  61. kreuzberg-3.8.1/tests/test_source_files/better-ocr-image.jpg +0 -0
  62. kreuzberg-3.8.1/tests/test_source_files/email/sample-email.eml +11 -0
  63. kreuzberg-3.8.1/tests/test_source_files/json/sample-document.json +1 -0
  64. kreuzberg-3.8.1/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  65. kreuzberg-3.8.1/tests/test_source_files/toml/sample-config.toml +33 -0
  66. kreuzberg-3.8.1/tests/test_source_files/yaml/sample-config.yaml +15 -0
  67. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/process_pool_test.py +1 -1
  68. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/string_test.py +3 -3
  69. kreuzberg-3.8.1/tests/utils/table_test.py +413 -0
  70. kreuzberg-3.8.1/uv.lock +6637 -0
  71. kreuzberg-3.7.0/.gitmodules +0 -3
  72. kreuzberg-3.7.0/docs/index.md +0 -16
  73. kreuzberg-3.7.0/kreuzberg/_extractors/_spread_sheet.py +0 -183
  74. kreuzberg-3.7.0/kreuzberg/_multiprocessing/__init__.py +0 -6
  75. kreuzberg-3.7.0/kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  76. kreuzberg-3.7.0/kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  77. kreuzberg-3.7.0/kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  78. kreuzberg-3.7.0/kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  79. kreuzberg-3.7.0/kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  80. kreuzberg-3.7.0/kreuzberg/_ocr/_base.py +0 -54
  81. kreuzberg-3.7.0/kreuzberg/_ocr/_tesseract.py +0 -436
  82. kreuzberg-3.7.0/kreuzberg/_utils/_process_pool.py +0 -100
  83. kreuzberg-3.7.0/kreuzberg/_utils/_string.py +0 -39
  84. kreuzberg-3.7.0/tests/multiprocessing/sync_tesseract_test.py +0 -366
  85. kreuzberg-3.7.0/uv.lock +0 -4369
  86. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.commitlintrc +0 -0
  87. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.docker/Dockerfile +0 -0
  88. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.docker/README.md +0 -0
  89. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.dockerignore +0 -0
  90. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/dependabot.yaml +0 -0
  91. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/workflows/ci.yaml +0 -0
  92. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/workflows/docs.yml +0 -0
  93. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/workflows/pr-title.yaml +0 -0
  94. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/workflows/publish-docker.yml +0 -0
  95. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/workflows/release.yaml +0 -0
  96. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.markdownlint.yaml +0 -0
  97. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.pre-commit-config.yaml +0 -0
  98. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/LICENSE +0 -0
  99. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/ai-rulez.yaml +0 -0
  100. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/README.md +0 -0
  101. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/benchmark_baseline.py +0 -0
  102. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/end_to_end_benchmark.py +0 -0
  103. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/final_benchmark.py +0 -0
  104. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/baseline_results.json +0 -0
  105. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  106. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
  107. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/final_benchmark_results.json +0 -0
  108. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/mime_caching_results.json +0 -0
  109. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/msgspec_caching_results.json +0 -0
  110. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/ocr_caching_results.json +0 -0
  111. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
  112. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
  113. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/table_caching_results.json +0 -0
  114. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/serialization_benchmark.py +0 -0
  115. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  116. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  117. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  118. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/statistical_benchmark.py +0 -0
  119. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/advanced/custom-extractors.md +0 -0
  120. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/advanced/custom-hooks.md +0 -0
  121. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/advanced/error-handling.md +0 -0
  122. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/advanced/index.md +0 -0
  123. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/advanced/performance.md +0 -0
  124. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/exceptions.md +0 -0
  125. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/extraction-functions.md +0 -0
  126. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/extractor-registry.md +0 -0
  127. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/index.md +0 -0
  128. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/ocr-configuration.md +0 -0
  129. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/types.md +0 -0
  130. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/assets/favicon.png +0 -0
  131. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/assets/logo.png +0 -0
  132. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/changelog.md +0 -0
  133. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/cli.md +0 -0
  134. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/contributing.md +0 -0
  135. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/css/extra.css +0 -0
  136. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/examples/extraction-examples.md +0 -0
  137. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/examples/index.md +0 -0
  138. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/getting-started/index.md +0 -0
  139. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/getting-started/installation.md +0 -0
  140. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/getting-started/quick-start.md +0 -0
  141. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/api-server.md +0 -0
  142. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/basic-usage.md +0 -0
  143. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/chunking.md +0 -0
  144. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/extraction-configuration.md +0 -0
  145. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/index.md +0 -0
  146. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/metadata-extraction.md +0 -0
  147. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/ocr-backends.md +0 -0
  148. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/ocr-configuration.md +0 -0
  149. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/supported-formats.md +0 -0
  150. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/__init__.py +0 -0
  151. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/__main__.py +0 -0
  152. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_api/__init__.py +0 -0
  153. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_api/main.py +0 -0
  154. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_chunker.py +0 -0
  155. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_cli_config.py +0 -0
  156. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_constants.py +0 -0
  157. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/__init__.py +0 -0
  158. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_language_detection.py +0 -0
  159. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_mcp/__init__.py +0 -0
  160. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_mcp/server.py +0 -0
  161. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_playa.py +0 -0
  162. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/__init__.py +0 -0
  163. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_device.py +0 -0
  164. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_document_cache.py +0 -0
  165. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
  166. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_tmp.py +0 -0
  167. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/exceptions.py +0 -0
  168. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/py.typed +0 -0
  169. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/__init__.py +0 -0
  170. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/api/__init__.py +0 -0
  171. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/chunker_test.py +0 -0
  172. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/cli_integration_test.py +0 -0
  173. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/cli_test.py +0 -0
  174. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/conftest.py +0 -0
  175. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/entity_extraction_test.py +0 -0
  176. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/exceptions_test.py +0 -0
  177. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/__init__.py +0 -0
  178. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/html_test.py +0 -0
  179. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/pandoc_metadata_test.py +0 -0
  180. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/pandoc_test.py +0 -0
  181. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/presentation_test.py +0 -0
  182. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/spreed_sheet_test.py +0 -0
  183. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/gmft_extended_test.py +0 -0
  184. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/hooks_test.py +0 -0
  185. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/language_detection_test.py +0 -0
  186. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/mcp_server_test.py +0 -0
  187. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/mime_types_test.py +0 -0
  188. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/multiprocessing/__init__.py +0 -0
  189. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
  190. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/__init__.py +0 -0
  191. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/device_integration_test.py +0 -0
  192. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/init_test.py +0 -0
  193. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/playa_test.py +0 -0
  194. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/registry_test.py +0 -0
  195. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/document.docx +0 -0
  196. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  197. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/excel.xlsx +0 -0
  198. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/french-text.txt +0 -0
  199. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/german-text.txt +0 -0
  200. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/html.html +0 -0
  201. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/markdown.md +0 -0
  202. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
  203. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/non-searchable.pdf +0 -0
  204. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/ocr-image.jpg +0 -0
  205. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  206. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  207. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  208. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  209. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/sample-contract.pdf +0 -0
  210. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/scanned.pdf +0 -0
  211. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/searchable.pdf +0 -0
  212. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/spanish-text.txt +0 -0
  213. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/test-article.pdf +0 -0
  214. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/types_test.py +0 -0
  215. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/__init__.py +0 -0
  216. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/cache_test.py +0 -0
  217. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/device_test.py +0 -0
  218. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/errors_test.py +0 -0
  219. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/pdf_lock_test.py +0 -0
  220. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/serialization_test.py +0 -0
  221. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/sync_test.py +0 -0
  222. {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/tmp_test.py +0 -0
@@ -33,3 +33,5 @@ GEMINI.md
33
33
  prompt_template.egg-info/
34
34
  requirements.txt
35
35
  site/
36
+ .cache/
37
+ dist/
@@ -1,14 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.7.0
4
- Summary: A text extraction library supporting PDFs, images, office documents and more
3
+ Version: 3.8.1
4
+ Summary: Advanced document intelligence framework for extracting structured content from PDFs, images, and office documents
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
7
7
  License: MIT
8
8
  License-File: LICENSE
9
- Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
9
+ Keywords: automation,content-extraction,data-processing,document-analysis,document-intelligence,document-processing,entity-extraction,image-to-text,information-extraction,ocr,pdf-extraction,rag,structured-data,table-extraction,text-extraction
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Information Technology
13
+ Classifier: Intended Audience :: Science/Research
12
14
  Classifier: License :: OSI Approved :: MIT License
13
15
  Classifier: Operating System :: OS Independent
14
16
  Classifier: Programming Language :: Python :: 3 :: Only
@@ -16,16 +18,19 @@ Classifier: Programming Language :: Python :: 3.10
16
18
  Classifier: Programming Language :: Python :: 3.11
17
19
  Classifier: Programming Language :: Python :: 3.12
18
20
  Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Database
22
+ Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
23
+ Classifier: Topic :: Office/Business :: Office Suites
19
24
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
25
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
20
26
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
27
  Classifier: Topic :: Text Processing :: General
22
- Classifier: Topic :: Utilities
23
28
  Classifier: Typing :: Typed
24
29
  Requires-Python: >=3.10
25
30
  Requires-Dist: anyio>=4.9.0
26
- Requires-Dist: charset-normalizer>=3.4.2
31
+ Requires-Dist: chardetng-py>=0.3.4
27
32
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
28
- Requires-Dist: html-to-markdown[lxml]>=1.6.0
33
+ Requires-Dist: html-to-markdown[lxml]>=1.8.0
29
34
  Requires-Dist: mcp>=1.11.0
30
35
  Requires-Dist: msgspec>=0.18.0
31
36
  Requires-Dist: playa-pdf>=0.6.1
@@ -34,6 +39,9 @@ Requires-Dist: pypdfium2==4.30.0
34
39
  Requires-Dist: python-calamine>=0.3.2
35
40
  Requires-Dist: python-pptx>=1.0.2
36
41
  Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
42
+ Provides-Extra: additional-extensions
43
+ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
44
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
37
45
  Provides-Extra: all
38
46
  Requires-Dist: click>=8.2.1; extra == 'all'
39
47
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
@@ -41,6 +49,7 @@ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
41
49
  Requires-Dist: gmft>=0.4.2; extra == 'all'
42
50
  Requires-Dist: keybert>=0.9.0; extra == 'all'
43
51
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
52
+ Requires-Dist: mailparse>=1.0.15; extra == 'all'
44
53
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
45
54
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
46
55
  Requires-Dist: rich>=14.0.0; extra == 'all'
@@ -77,22 +86,33 @@ Description-Content-Type: text/markdown
77
86
  [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
78
87
  [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
79
88
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
89
+ [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
80
90
 
81
- **High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
91
+ **Advanced Document Intelligence for Modern Python Applications.** Transform PDFs, images, and office documents into structured data with production-grade performance. Built by engineers who understand that speed, reliability, and developer experience matter.
82
92
 
83
93
  📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
84
94
 
85
- ## Why Kreuzberg?
95
+ ## Why Choose Kreuzberg?
86
96
 
87
- - **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
88
- - **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+) with lowest memory usage (~530MB)
89
- - **⚡ Dual APIs**: Only library with both sync and async support
90
- - **🔧 Zero Configuration**: Works out of the box with sane defaults
91
- - **🏠 Local Processing**: No cloud dependencies or external API calls
92
- - **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
93
- - **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
94
- - **🤖 AI Integration**: Native MCP server for Claude and other AI tools
95
- - **🐳 Production Ready**: CLI, REST API, MCP server, and Docker images included
97
+ ### Proven Performance
98
+
99
+ [Benchmarked](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) 6-126x faster than alternatives while using minimal resources. Process up to 14 files per second with 87MB install size and ~360MB memory usage. Optimized for production workloads and resource-constrained environments.
100
+
101
+ ### 🏗️ Production Engineering
102
+
103
+ Comprehensive test coverage (95%+), robust error handling, and true async/await support. Built with modern Python practices for reliability in production environments.
104
+
105
+ ### 🔧 Developer Experience
106
+
107
+ Works immediately with smart defaults, scales as you grow. Native MCP integration for AI tools, full type safety, and clear documentation.
108
+
109
+ ### 🚀 Flexible Deployment
110
+
111
+ Deploy on serverless platforms, containers, or traditional servers. Supports both CPU and GPU processing (via PaddleOCR and EasyOCR). No external API dependencies. Multiple deployment modes: CLI, REST API, MCP server.
112
+
113
+ ### 📄 Comprehensive Format Support
114
+
115
+ Extract from PDFs, images, Office documents, HTML, spreadsheets, and presentations. Multiple OCR engines with intelligent fallbacks, table extraction, and content preparation for RAG workflows.
96
116
 
97
117
  ## Quick Start
98
118
 
@@ -128,7 +148,7 @@ import asyncio
128
148
  from kreuzberg import extract_file
129
149
 
130
150
  async def main():
131
- # Extract from any document type
151
+ # Extract content from files
132
152
  result = await extract_file("document.pdf")
133
153
  print(result.content)
134
154
  print(result.metadata)
@@ -197,7 +217,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
197
217
  curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
198
218
  ```
199
219
 
200
- Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
220
+ Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
201
221
 
202
222
  ### 🌐 REST API
203
223
 
@@ -240,23 +260,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
240
260
  | **Web** | HTML, XML, MHTML |
241
261
  | **Archives** | Support via extraction |
242
262
 
243
- ## Performance
263
+ ## 📊 Performance Comparison
244
264
 
245
- **[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
265
+ [Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across ~100 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://goldziher.github.io/kreuzberg/performance-analysis/):
246
266
 
247
- | Library | Speed | Memory | Install Size | Dependencies | Success Rate |
248
- | ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
249
- | **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
250
- | Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
251
- | MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
252
- | Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
267
+ | Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
268
+ | ------------- | ------------ | ------ | ------------ | ------------ | ------------ |
269
+ | **Kreuzberg** | 14.4 files/s | 360MB | 87MB | 43 | 100% |
270
+ | Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
271
+ | MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
272
+ | Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
253
273
 
254
- \*_Can achieve 75% reliability with 15% performance trade-off when configured_
255
- †_Good on simple documents, struggles with large/complex files (>10MB)_
256
- ‡_Frequently fails/times out on medium files (>1MB)_
274
+ \*_Performance varies significantly with document complexity and size_
257
275
 
258
- > **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
259
- > **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
276
+ **Key strengths:**
277
+
278
+ - 6-126x faster processing than comparable frameworks
279
+ - Smallest installation footprint and memory usage
280
+ - Only framework with built-in async/await support
281
+ - Supports both CPU and GPU processing
282
+ - Built by software engineers for production reliability
283
+
284
+ > **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
260
285
 
261
286
  ## Documentation
262
287
 
@@ -264,34 +289,13 @@ kreuzberg extract *.pdf --output-dir ./extracted/
264
289
 
265
290
  - [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
266
291
  - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
292
+ - [Performance Analysis](https://goldziher.github.io/kreuzberg/performance-analysis/) - Detailed benchmark results
267
293
  - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
268
294
  - [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
269
295
  - [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
270
296
  - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
271
297
  - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
272
298
 
273
- ## Advanced Features
274
-
275
- - **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
276
- - **📊 Table Extraction**: Extract tables from PDFs with GMFT
277
- - **🧩 Content Chunking**: Split documents for RAG applications
278
- - **🎯 Custom Extractors**: Extend with your own document handlers
279
- - **🔧 Configuration**: Flexible TOML-based configuration
280
- - **🪝 Hooks**: Pre/post-processing customization
281
- - **🌍 Multi-language OCR**: 100+ languages supported
282
- - **⚙️ Metadata Extraction**: Rich document metadata
283
- - **🔄 Batch Processing**: Efficient bulk document processing
284
-
285
299
  ## License
286
300
 
287
301
  MIT License - see [LICENSE](LICENSE) for details.
288
-
289
- ______________________________________________________________________
290
-
291
- <div align="center">
292
-
293
- **[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
294
-
295
- Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
296
-
297
- </div>
@@ -4,22 +4,33 @@
4
4
  [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
5
5
  [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
6
6
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+ [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
7
8
 
8
- **High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
9
+ **Advanced Document Intelligence for Modern Python Applications.** Transform PDFs, images, and office documents into structured data with production-grade performance. Built by engineers who understand that speed, reliability, and developer experience matter.
9
10
 
10
11
  📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
11
12
 
12
- ## Why Kreuzberg?
13
+ ## Why Choose Kreuzberg?
13
14
 
14
- - **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
15
- - **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+) with lowest memory usage (~530MB)
16
- - **⚡ Dual APIs**: Only library with both sync and async support
17
- - **🔧 Zero Configuration**: Works out of the box with sane defaults
18
- - **🏠 Local Processing**: No cloud dependencies or external API calls
19
- - **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
20
- - **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
21
- - **🤖 AI Integration**: Native MCP server for Claude and other AI tools
22
- - **🐳 Production Ready**: CLI, REST API, MCP server, and Docker images included
15
+ ### Proven Performance
16
+
17
+ [Benchmarked](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) 6-126x faster than alternatives while using minimal resources. Process up to 14 files per second with 87MB install size and ~360MB memory usage. Optimized for production workloads and resource-constrained environments.
18
+
19
+ ### 🏗️ Production Engineering
20
+
21
+ Comprehensive test coverage (95%+), robust error handling, and true async/await support. Built with modern Python practices for reliability in production environments.
22
+
23
+ ### 🔧 Developer Experience
24
+
25
+ Works immediately with smart defaults, scales as you grow. Native MCP integration for AI tools, full type safety, and clear documentation.
26
+
27
+ ### 🚀 Flexible Deployment
28
+
29
+ Deploy on serverless platforms, containers, or traditional servers. Supports both CPU and GPU processing (via PaddleOCR and EasyOCR). No external API dependencies. Multiple deployment modes: CLI, REST API, MCP server.
30
+
31
+ ### 📄 Comprehensive Format Support
32
+
33
+ Extract from PDFs, images, Office documents, HTML, spreadsheets, and presentations. Multiple OCR engines with intelligent fallbacks, table extraction, and content preparation for RAG workflows.
23
34
 
24
35
  ## Quick Start
25
36
 
@@ -55,7 +66,7 @@ import asyncio
55
66
  from kreuzberg import extract_file
56
67
 
57
68
  async def main():
58
- # Extract from any document type
69
+ # Extract content from files
59
70
  result = await extract_file("document.pdf")
60
71
  print(result.content)
61
72
  print(result.metadata)
@@ -124,7 +135,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
124
135
  curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
125
136
  ```
126
137
 
127
- Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
138
+ Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
128
139
 
129
140
  ### 🌐 REST API
130
141
 
@@ -167,23 +178,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
167
178
  | **Web** | HTML, XML, MHTML |
168
179
  | **Archives** | Support via extraction |
169
180
 
170
- ## Performance
181
+ ## 📊 Performance Comparison
171
182
 
172
- **[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
183
+ [Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across ~100 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://goldziher.github.io/kreuzberg/performance-analysis/):
173
184
 
174
- | Library | Speed | Memory | Install Size | Dependencies | Success Rate |
175
- | ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
176
- | **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
177
- | Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
178
- | MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
179
- | Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
185
+ | Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
186
+ | ------------- | ------------ | ------ | ------------ | ------------ | ------------ |
187
+ | **Kreuzberg** | 14.4 files/s | 360MB | 87MB | 43 | 100% |
188
+ | Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
189
+ | MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
190
+ | Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
180
191
 
181
- \*_Can achieve 75% reliability with 15% performance trade-off when configured_
182
- †_Good on simple documents, struggles with large/complex files (>10MB)_
183
- ‡_Frequently fails/times out on medium files (>1MB)_
192
+ \*_Performance varies significantly with document complexity and size_
184
193
 
185
- > **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
186
- > **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
194
+ **Key strengths:**
195
+
196
+ - 6-126x faster processing than comparable frameworks
197
+ - Smallest installation footprint and memory usage
198
+ - Only framework with built-in async/await support
199
+ - Supports both CPU and GPU processing
200
+ - Built by software engineers for production reliability
201
+
202
+ > **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
187
203
 
188
204
  ## Documentation
189
205
 
@@ -191,34 +207,13 @@ kreuzberg extract *.pdf --output-dir ./extracted/
191
207
 
192
208
  - [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
193
209
  - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
210
+ - [Performance Analysis](https://goldziher.github.io/kreuzberg/performance-analysis/) - Detailed benchmark results
194
211
  - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
195
212
  - [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
196
213
  - [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
197
214
  - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
198
215
  - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
199
216
 
200
- ## Advanced Features
201
-
202
- - **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
203
- - **📊 Table Extraction**: Extract tables from PDFs with GMFT
204
- - **🧩 Content Chunking**: Split documents for RAG applications
205
- - **🎯 Custom Extractors**: Extend with your own document handlers
206
- - **🔧 Configuration**: Flexible TOML-based configuration
207
- - **🪝 Hooks**: Pre/post-processing customization
208
- - **🌍 Multi-language OCR**: 100+ languages supported
209
- - **⚙️ Metadata Extraction**: Rich document metadata
210
- - **🔄 Batch Processing**: Efficient bulk document processing
211
-
212
217
  ## License
213
218
 
214
219
  MIT License - see [LICENSE](LICENSE) for details.
215
-
216
- ______________________________________________________________________
217
-
218
- <div align="center">
219
-
220
- **[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
221
-
222
- Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
223
-
224
- </div>
@@ -3,9 +3,12 @@ name = "kreuzberg-benchmarks"
3
3
  version = "0.1.0"
4
4
  description = "Performance benchmarking suite for Kreuzberg text extraction library"
5
5
  readme = "README.md"
6
- requires-python = ">=3.13"
6
+ requires-python = ">=3.10"
7
7
  classifiers = [
8
8
  "Programming Language :: Python :: 3 :: Only",
9
+ "Programming Language :: Python :: 3.10",
10
+ "Programming Language :: Python :: 3.11",
11
+ "Programming Language :: Python :: 3.12",
9
12
  "Programming Language :: Python :: 3.13",
10
13
  ]
11
14
  dependencies = [