kreuzberg 3.11.1__tar.gz → 3.11.2__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (220)
  1. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.pre-commit-config.yaml +7 -9
  2. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/PKG-INFO +4 -4
  3. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_gmft.py +28 -10
  4. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/pyproject.toml +4 -4
  5. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/gmft_test.py +119 -0
  6. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/uv.lock +110 -93
  7. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.commitlintrc +0 -0
  8. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.deepsource.toml +0 -0
  9. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.docker/Dockerfile +0 -0
  10. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.docker/README.md +0 -0
  11. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.dockerignore +0 -0
  12. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/dependabot.yaml +0 -0
  13. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/workflows/ci.yaml +0 -0
  14. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/workflows/docs.yml +0 -0
  15. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/workflows/pr-title.yaml +0 -0
  16. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/workflows/publish-docker.yml +0 -0
  17. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/workflows/release.yaml +0 -0
  18. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.gitignore +0 -0
  19. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.markdownlint.yaml +0 -0
  20. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/LICENSE +0 -0
  21. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/README.md +0 -0
  22. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/ai-rulez.yaml +0 -0
  23. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/README.md +0 -0
  24. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/benchmark_baseline.py +0 -0
  25. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/end_to_end_benchmark.py +0 -0
  26. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/final_benchmark.py +0 -0
  27. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/pyproject.toml +0 -0
  28. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/baseline_results.json +0 -0
  29. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  30. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/comprehensive_caching_results.json +0 -0
  31. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/final_benchmark_results.json +0 -0
  32. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/latest.json +0 -0
  33. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/mime_caching_results.json +0 -0
  34. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/msgspec_caching_results.json +0 -0
  35. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/ocr_caching_results.json +0 -0
  36. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/serialization_benchmark_results.json +0 -0
  37. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/statistical_benchmark_results.json +0 -0
  38. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/table_caching_results.json +0 -0
  39. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/serialization_benchmark.py +0 -0
  40. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  41. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  42. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  43. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  44. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  45. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  46. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  47. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/statistical_benchmark.py +0 -0
  48. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/advanced/custom-extractors.md +0 -0
  49. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/advanced/custom-hooks.md +0 -0
  50. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/advanced/error-handling.md +0 -0
  51. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/advanced/index.md +0 -0
  52. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/advanced/performance.md +0 -0
  53. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/exceptions.md +0 -0
  54. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/extraction-functions.md +0 -0
  55. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/extractor-registry.md +0 -0
  56. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/index.md +0 -0
  57. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/ocr-configuration.md +0 -0
  58. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/types.md +0 -0
  59. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/assets/favicon.png +0 -0
  60. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/assets/logo.png +0 -0
  61. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/cli.md +0 -0
  62. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/contributing.md +0 -0
  63. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/css/extra.css +0 -0
  64. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/examples/extraction-examples.md +0 -0
  65. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/examples/index.md +0 -0
  66. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/getting-started/index.md +0 -0
  67. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/getting-started/installation.md +0 -0
  68. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/getting-started/quick-start.md +0 -0
  69. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/index.md +0 -0
  70. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/performance-analysis.md +0 -0
  71. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/api-server.md +0 -0
  72. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/basic-usage.md +0 -0
  73. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/chunking.md +0 -0
  74. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/docker.md +0 -0
  75. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/document-classification.md +0 -0
  76. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/extraction-configuration.md +0 -0
  77. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/index.md +0 -0
  78. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/mcp-server.md +0 -0
  79. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/metadata-extraction.md +0 -0
  80. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/ocr-backends.md +0 -0
  81. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/ocr-configuration.md +0 -0
  82. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/supported-formats.md +0 -0
  83. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/__init__.py +0 -0
  84. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/__main__.py +0 -0
  85. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_api/__init__.py +0 -0
  86. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_api/main.py +0 -0
  87. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_chunker.py +0 -0
  88. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_config.py +0 -0
  89. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_constants.py +0 -0
  90. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_document_classification.py +0 -0
  91. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_entity_extraction.py +0 -0
  92. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/__init__.py +0 -0
  93. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_base.py +0 -0
  94. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_email.py +0 -0
  95. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_html.py +0 -0
  96. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_image.py +0 -0
  97. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_pandoc.py +0 -0
  98. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_pdf.py +0 -0
  99. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_presentation.py +0 -0
  100. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  101. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_structured.py +0 -0
  102. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_language_detection.py +0 -0
  103. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_mcp/__init__.py +0 -0
  104. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_mcp/server.py +0 -0
  105. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_mime_types.py +0 -0
  106. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_ocr/__init__.py +0 -0
  107. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_ocr/_base.py +0 -0
  108. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_ocr/_easyocr.py +0 -0
  109. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_ocr/_paddleocr.py +0 -0
  110. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_ocr/_tesseract.py +0 -0
  111. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_playa.py +0 -0
  112. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_registry.py +0 -0
  113. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_types.py +0 -0
  114. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/__init__.py +0 -0
  115. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_cache.py +0 -0
  116. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_device.py +0 -0
  117. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_document_cache.py +0 -0
  118. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_errors.py +0 -0
  119. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_pdf_lock.py +0 -0
  120. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_process_pool.py +0 -0
  121. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_quality.py +0 -0
  122. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_serialization.py +0 -0
  123. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_string.py +0 -0
  124. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_sync.py +0 -0
  125. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_table.py +0 -0
  126. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_tmp.py +0 -0
  127. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/cli.py +0 -0
  128. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/exceptions.py +0 -0
  129. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/extraction.py +0 -0
  130. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/py.typed +0 -0
  131. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/mkdocs.yaml +0 -0
  132. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/__init__.py +0 -0
  133. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/api/__init__.py +0 -0
  134. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/api/main_test.py +0 -0
  135. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/chunker_test.py +0 -0
  136. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/cli_command_test.py +0 -0
  137. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/cli_integration_test.py +0 -0
  138. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/cli_test.py +0 -0
  139. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/config_test.py +0 -0
  140. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/conftest.py +0 -0
  141. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/document_classification_test.py +0 -0
  142. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/entity_extraction_test.py +0 -0
  143. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/exceptions_test.py +0 -0
  144. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extraction_batch_test.py +0 -0
  145. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extraction_test.py +0 -0
  146. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/__init__.py +0 -0
  147. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/email_test.py +0 -0
  148. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/html_test.py +0 -0
  149. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/image_test.py +0 -0
  150. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/pandoc_metadata_test.py +0 -0
  151. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/pandoc_test.py +0 -0
  152. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/pdf_test.py +0 -0
  153. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/presentation_test.py +0 -0
  154. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/spreed_sheet_test.py +0 -0
  155. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/structured_test.py +0 -0
  156. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/gmft_extended_test.py +0 -0
  157. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/hooks_test.py +0 -0
  158. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/language_detection_test.py +0 -0
  159. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/mcp_server_test.py +0 -0
  160. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/mime_types_test.py +0 -0
  161. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/multiprocessing/__init__.py +0 -0
  162. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/multiprocessing/gmft_integration_test.py +0 -0
  163. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  164. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/multiprocessing/process_manager_test.py +0 -0
  165. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  166. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/__init__.py +0 -0
  167. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/base_test.py +0 -0
  168. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/device_integration_test.py +0 -0
  169. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/easyocr_test.py +0 -0
  170. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/init_test.py +0 -0
  171. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/paddleocr_test.py +0 -0
  172. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/tesseract_test.py +0 -0
  173. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/playa_helpers_test.py +0 -0
  174. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/playa_test.py +0 -0
  175. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/registry_test.py +0 -0
  176. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/better-ocr-image.jpg +0 -0
  177. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/contract.txt +0 -0
  178. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/contract_test.txt +0 -0
  179. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/document.docx +0 -0
  180. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/email/sample-email.eml +0 -0
  181. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  182. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/excel.xlsx +0 -0
  183. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/form_test.txt +0 -0
  184. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/french-text.txt +0 -0
  185. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/german-text.txt +0 -0
  186. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/html.html +0 -0
  187. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/images/test_hello_world.png +0 -0
  188. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/invoice_image.png +0 -0
  189. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/invoice_test.txt +0 -0
  190. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/json/sample-document.json +0 -0
  191. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  192. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/markdown.md +0 -0
  193. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/non-ascii-text.pdf +0 -0
  194. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/non-searchable.pdf +0 -0
  195. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/ocr-image.jpg +0 -0
  196. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  197. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  198. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  199. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  200. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/receipt_test.txt +0 -0
  201. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/report_test.txt +0 -0
  202. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/sample-contract.pdf +0 -0
  203. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/scanned.pdf +0 -0
  204. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/searchable.pdf +0 -0
  205. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/spanish-text.txt +0 -0
  206. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/test-article.pdf +0 -0
  207. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  208. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/types_test.py +0 -0
  209. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/__init__.py +0 -0
  210. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/cache_test.py +0 -0
  211. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/device_test.py +0 -0
  212. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/errors_test.py +0 -0
  213. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/pdf_lock_test.py +0 -0
  214. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/process_pool_test.py +0 -0
  215. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/serialization_test.py +0 -0
  216. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/string_test.py +0 -0
  217. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/sync_test.py +0 -0
  218. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/table_test.py +0 -0
  219. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/tmp_test.py +0 -0
  220. {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils_errors_test.py +0 -0
@@ -5,13 +5,11 @@ repos:
5
5
  - id: commitlint
6
6
  stages: [commit-msg]
7
7
  additional_dependencies: ["@commitlint/config-conventional"]
8
- # Temporarily disabled - ai-rulez Go build failing in CI
9
- # TODO: Re-enable once ai-rulez v1.4.4+ Python migration is stable
10
- # - repo: https://github.com/Goldziher/ai-rulez
11
- # rev: v1.4.3
12
- # hooks:
13
- # - id: ai-rulez-validate
14
- # - id: ai-rulez-generate
8
+ - repo: https://github.com/Goldziher/ai-rulez
9
+ rev: v1.5.1
10
+ hooks:
11
+ - id: ai-rulez-validate
12
+ - id: ai-rulez-generate
15
13
  - repo: https://github.com/pre-commit/pre-commit-hooks
16
14
  rev: v6.0.0
17
15
  hooks:
@@ -55,7 +53,7 @@ repos:
55
53
  hooks:
56
54
  - id: pyproject-fmt
57
55
  - repo: https://github.com/astral-sh/ruff-pre-commit
58
- rev: v0.12.8
56
+ rev: v0.12.9
59
57
  hooks:
60
58
  - id: ruff
61
59
  args: ["--fix", "--unsafe-fixes"]
@@ -68,7 +66,7 @@ repos:
68
66
  additional_dependencies:
69
67
  - tomli
70
68
  - repo: https://github.com/jsh9/pydoclint
71
- rev: 0.6.7
69
+ rev: 0.6.10
72
70
  hooks:
73
71
  - id: pydoclint
74
72
  args:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.11.1
3
+ Version: 3.11.2
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
34
  Requires-Dist: html-to-markdown[lxml]>=1.9.0
35
- Requires-Dist: mcp>=1.12.4
35
+ Requires-Dist: mcp>=1.13.0
36
36
  Requires-Dist: msgspec>=0.18.0
37
37
  Requires-Dist: playa-pdf>=0.7.0
38
38
  Requires-Dist: psutil>=7.0.0
@@ -52,7 +52,7 @@ Requires-Dist: gmft>=0.4.2; extra == 'all'
52
52
  Requires-Dist: keybert>=0.9.0; extra == 'all'
53
53
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
54
54
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
55
- Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
+ Requires-Dist: paddleocr>=3.1.1; extra == 'all'
56
56
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
57
57
  Requires-Dist: pandas>=2.3.1; extra == 'all'
58
58
  Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
@@ -84,7 +84,7 @@ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
84
84
  Provides-Extra: langdetect
85
85
  Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
86
86
  Provides-Extra: paddleocr
87
- Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
87
+ Requires-Dist: paddleocr>=3.1.1; extra == 'paddleocr'
88
88
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
89
89
  Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
90
90
  Description-Content-Type: text/markdown
@@ -444,14 +444,26 @@ def _extract_tables_in_process(
444
444
  cropped_image.save(img_bytes, format="PNG")
445
445
  img_bytes.seek(0)
446
446
 
447
- results.append(
448
- {
449
- "cropped_image_bytes": img_bytes.getvalue(),
450
- "page_number": cropped_table.page.page_number,
451
- "text": data_frame.to_markdown(),
452
- "df_csv": data_frame.to_csv(index=False),
453
- }
454
- )
447
+ if data_frame.empty:
448
+ results.append(
449
+ {
450
+ "cropped_image_bytes": img_bytes.getvalue(),
451
+ "page_number": cropped_table.page.page_number,
452
+ "text": data_frame.to_markdown(),
453
+ "df_columns": data_frame.columns.tolist(),
454
+ "df_csv": None,
455
+ }
456
+ )
457
+ else:
458
+ results.append(
459
+ {
460
+ "cropped_image_bytes": img_bytes.getvalue(),
461
+ "page_number": cropped_table.page.page_number,
462
+ "text": data_frame.to_markdown(),
463
+ "df_columns": None,
464
+ "df_csv": data_frame.to_csv(index=False),
465
+ }
466
+ )
455
467
 
456
468
  result_queue.put((True, results))
457
469
 
@@ -532,7 +544,10 @@ def _extract_tables_isolated(
532
544
  img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
533
545
  import pandas as pd # noqa: PLC0415
534
546
 
535
- df = pd.read_csv(StringIO(table_dict["df_csv"]))
547
+ if table_dict["df_csv"] is None:
548
+ df = pd.DataFrame(columns=table_dict["df_columns"])
549
+ else:
550
+ df = pd.read_csv(StringIO(table_dict["df_csv"]))
536
551
 
537
552
  tables.append(
538
553
  TableData(
@@ -638,7 +653,10 @@ async def _extract_tables_isolated_async(
638
653
  img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
639
654
  import pandas as pd # noqa: PLC0415
640
655
 
641
- df = pd.read_csv(StringIO(table_dict["df_csv"]))
656
+ if table_dict["df_csv"] is None:
657
+ df = pd.DataFrame(columns=table_dict["df_columns"])
658
+ else:
659
+ df = pd.read_csv(StringIO(table_dict["df_csv"]))
642
660
 
643
661
  tables.append(
644
662
  TableData(
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
5
5
 
6
6
  [project]
7
7
  name = "kreuzberg"
8
- version = "3.11.1"
8
+ version = "3.11.2"
9
9
  description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
10
10
  readme = "README.md"
11
11
  keywords = [
@@ -61,7 +61,7 @@ dependencies = [
61
61
  "chardetng-py>=0.3.5",
62
62
  "exceptiongroup>=1.2.2; python_version<'3.11'",
63
63
  "html-to-markdown[lxml]>=1.9.0",
64
- "mcp>=1.12.4",
64
+ "mcp>=1.13.0",
65
65
  "msgspec>=0.18.0",
66
66
  "playa-pdf>=0.7.0", # pinned due to breaking changes in 0.5.0
67
67
  "psutil>=7.0.0",
@@ -97,7 +97,7 @@ optional-dependencies.entity-extraction = [ "keybert>=0.9.0", "spacy>=3.8.7" ]
97
97
  optional-dependencies.gmft = [ "gmft>=0.4.2" ]
98
98
  optional-dependencies.langdetect = [ "fast-langdetect>=0.3.2" ]
99
99
  optional-dependencies.paddleocr = [
100
- "paddleocr>=3.1.0",
100
+ "paddleocr>=3.1.1",
101
101
  "paddlepaddle>=3.1.0",
102
102
  "setuptools>=80.9.0",
103
103
  ]
@@ -117,7 +117,7 @@ dev = [
117
117
  "pytest-mock>=3.14.0",
118
118
  "pytest-rerunfailures>=15.1",
119
119
  "pytest-timeout>=2.4.0",
120
- "ruff>=0.12.8",
120
+ "ruff>=0.12.9",
121
121
  "trio>=0.30.0",
122
122
  "uv-bump",
123
123
  ]
@@ -669,6 +669,125 @@ class TestGMFTInlineExtractionEdgeCases:
669
669
  pytest.skip("GMFT dependency not available for inline testing")
670
670
 
671
671
 
672
+ class TestGMFTWithoutTables:
673
+ """Test GMFT behavior with PDFs that have no tables - issue #104."""
674
+
675
+ @pytest.mark.anyio
676
+ async def test_extract_tables_pdf_without_tables_async(self) -> None:
677
+ """Test that extract_tables handles PDFs without tables gracefully (async)."""
678
+ pdf_path = Path("tests/test_source_files/searchable.pdf")
679
+
680
+ try:
681
+ tables = await extract_tables(pdf_path)
682
+
683
+ assert isinstance(tables, list)
684
+
685
+ for table in tables:
686
+ assert "page_number" in table
687
+ assert "text" in table
688
+ assert "df" in table
689
+ assert "cropped_image" in table
690
+
691
+ import pandas as pd
692
+
693
+ assert isinstance(table["df"], pd.DataFrame)
694
+ except MissingDependencyError:
695
+ pytest.skip("GMFT dependency not installed")
696
+
697
+ def test_extract_tables_pdf_without_tables_sync(self) -> None:
698
+ """Test that extract_tables_sync handles PDFs without tables gracefully (sync)."""
699
+ # Using searchable.pdf which is a simple text PDF without tables
700
+ pdf_path = Path("tests/test_source_files/searchable.pdf")
701
+
702
+ try:
703
+ tables = extract_tables_sync(pdf_path)
704
+
705
+ assert isinstance(tables, list)
706
+
707
+ for table in tables:
708
+ assert "page_number" in table
709
+ assert "text" in table
710
+ assert "df" in table
711
+ assert "cropped_image" in table
712
+
713
+ import pandas as pd
714
+
715
+ assert isinstance(table["df"], pd.DataFrame)
716
+ except MissingDependencyError:
717
+ pytest.skip("GMFT dependency not installed")
718
+
719
+ @pytest.mark.anyio
720
+ async def test_extract_file_with_gmft_pdf_without_tables(self) -> None:
721
+ """Test that extract_file with extract_tables=True handles PDFs without tables gracefully."""
722
+ pdf_path = Path("tests/test_source_files/searchable.pdf")
723
+
724
+ config = ExtractionConfig(
725
+ extract_tables=True,
726
+ gmft_config=GMFTConfig(
727
+ detector_base_threshold=0.85,
728
+ remove_null_rows=True,
729
+ enable_multi_header=True,
730
+ ),
731
+ )
732
+
733
+ try:
734
+ result = await extract_file(pdf_path, config=config)
735
+
736
+ assert result.content
737
+ assert "Sample PDF" in result.content
738
+
739
+ assert hasattr(result, "tables")
740
+ assert isinstance(result.tables, list)
741
+
742
+ for table in result.tables:
743
+ assert "page_number" in table
744
+ assert "text" in table
745
+ assert "df" in table
746
+ assert "cropped_image" in table
747
+
748
+ import pandas as pd
749
+
750
+ assert isinstance(table["df"], pd.DataFrame)
751
+ except MissingDependencyError:
752
+ pytest.skip("GMFT dependency not installed")
753
+
754
+ def test_extract_file_sync_with_gmft_pdf_without_tables(self) -> None:
755
+ """Test that extract_file_sync with extract_tables=True handles PDFs without tables gracefully."""
756
+ pdf_path = Path("tests/test_source_files/searchable.pdf")
757
+
758
+ from kreuzberg.extraction import extract_file_sync
759
+
760
+ config = ExtractionConfig(
761
+ extract_tables=True,
762
+ gmft_config=GMFTConfig(
763
+ detector_base_threshold=0.85,
764
+ remove_null_rows=True,
765
+ enable_multi_header=True,
766
+ ),
767
+ )
768
+
769
+ try:
770
+ result = extract_file_sync(pdf_path, config=config)
771
+
772
+ assert result.content
773
+ assert "Sample PDF" in result.content
774
+
775
+ assert hasattr(result, "tables")
776
+ assert isinstance(result.tables, list)
777
+
778
+ for table in result.tables:
779
+ assert "page_number" in table
780
+ assert "text" in table
781
+ assert "df" in table
782
+ assert "cropped_image" in table
783
+
784
+ import pandas as pd
785
+
786
+ assert isinstance(table["df"], pd.DataFrame)
787
+ except MissingDependencyError:
788
+ pytest.skip("GMFT dependency not installed")
789
+
790
+
672
791
  class TestGMFTConfigSerialization:
673
792
  """Test GMFTConfig serialization for multiprocessing."""
674
793