kreuzberg 3.8.2__tar.gz → 3.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. kreuzberg-3.9.1/.deepsource.toml +54 -0
  2. kreuzberg-3.9.1/.github/workflows/ci.yaml +197 -0
  3. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.gitignore +3 -0
  4. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.pre-commit-config.yaml +1 -1
  5. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/PKG-INFO +17 -13
  6. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/README.md +6 -5
  7. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/end_to_end_benchmark.py +1 -1
  8. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/statistical_benchmark.py +1 -1
  9. kreuzberg-3.9.1/docs/changelog.md +49 -0
  10. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/index.md +1 -0
  11. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/basic-usage.md +28 -0
  12. kreuzberg-3.9.1/docs/user-guide/document-classification.md +53 -0
  13. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/extraction-configuration.md +6 -0
  14. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/index.md +1 -0
  15. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_chunker.py +3 -3
  16. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_config.py +1 -1
  17. kreuzberg-3.9.1/kreuzberg/_document_classification.py +156 -0
  18. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_entity_extraction.py +3 -3
  19. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_image.py +4 -3
  20. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_pdf.py +18 -10
  21. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_spread_sheet.py +4 -5
  22. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_structured.py +24 -18
  23. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_gmft.py +25 -31
  24. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_mime_types.py +1 -1
  25. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_ocr/_base.py +1 -1
  26. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_ocr/_easyocr.py +4 -4
  27. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_ocr/_paddleocr.py +3 -3
  28. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_ocr/_tesseract.py +10 -14
  29. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_types.py +23 -7
  30. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_cache.py +2 -3
  31. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_device.py +7 -7
  32. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/cli.py +2 -2
  33. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/extraction.py +18 -9
  34. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/mkdocs.yaml +1 -0
  35. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/pyproject.toml +27 -10
  36. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/conftest.py +5 -0
  37. kreuzberg-3.9.1/tests/document_classification_test.py +86 -0
  38. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/entity_extraction_test.py +2 -2
  39. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/pdf_test.py +0 -2
  40. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/gmft_test.py +3 -3
  41. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/multiprocessing/gmft_integration_test.py +2 -1
  42. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/multiprocessing/gmft_isolated_test.py +5 -9
  43. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/device_integration_test.py +14 -13
  44. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/paddleocr_test.py +0 -5
  45. kreuzberg-3.9.1/tests/test_source_files/contract_test.txt +4 -0
  46. kreuzberg-3.9.1/tests/test_source_files/form_test.txt +5 -0
  47. kreuzberg-3.9.1/tests/test_source_files/invoice_image.png +0 -0
  48. kreuzberg-3.9.1/tests/test_source_files/invoice_test.txt +4 -0
  49. kreuzberg-3.9.1/tests/test_source_files/receipt_test.txt +5 -0
  50. kreuzberg-3.9.1/tests/test_source_files/report_test.txt +4 -0
  51. kreuzberg-3.9.1/tests/utils/__init__.py +0 -0
  52. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/uv.lock +910 -715
  53. kreuzberg-3.8.2/.github/workflows/ci.yaml +0 -124
  54. kreuzberg-3.8.2/docs/changelog.md +0 -32
  55. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.commitlintrc +0 -0
  56. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.docker/Dockerfile +0 -0
  57. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.docker/README.md +0 -0
  58. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.dockerignore +0 -0
  59. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.github/dependabot.yaml +0 -0
  60. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.github/workflows/docs.yml +0 -0
  61. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.github/workflows/pr-title.yaml +0 -0
  62. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.github/workflows/publish-docker.yml +0 -0
  63. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.github/workflows/release.yaml +0 -0
  64. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.markdownlint.yaml +0 -0
  65. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/LICENSE +0 -0
  66. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/ai-rulez.yaml +0 -0
  67. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/README.md +0 -0
  68. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/benchmark_baseline.py +0 -0
  69. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/final_benchmark.py +0 -0
  70. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/pyproject.toml +0 -0
  71. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/baseline_results.json +0 -0
  72. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  73. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
  74. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/final_benchmark_results.json +0 -0
  75. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/latest.json +0 -0
  76. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/mime_caching_results.json +0 -0
  77. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/msgspec_caching_results.json +0 -0
  78. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/ocr_caching_results.json +0 -0
  79. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
  80. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
  81. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/table_caching_results.json +0 -0
  82. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/serialization_benchmark.py +0 -0
  83. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  84. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  85. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  86. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  87. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  88. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  89. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  90. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/advanced/custom-extractors.md +0 -0
  91. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/advanced/custom-hooks.md +0 -0
  92. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/advanced/error-handling.md +0 -0
  93. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/advanced/index.md +0 -0
  94. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/advanced/performance.md +0 -0
  95. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/exceptions.md +0 -0
  96. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/extraction-functions.md +0 -0
  97. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/extractor-registry.md +0 -0
  98. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/index.md +0 -0
  99. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/ocr-configuration.md +0 -0
  100. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/types.md +0 -0
  101. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/assets/favicon.png +0 -0
  102. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/assets/logo.png +0 -0
  103. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/cli.md +0 -0
  104. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/contributing.md +0 -0
  105. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/css/extra.css +0 -0
  106. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/examples/extraction-examples.md +0 -0
  107. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/examples/index.md +0 -0
  108. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/getting-started/index.md +0 -0
  109. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/getting-started/installation.md +0 -0
  110. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/getting-started/quick-start.md +0 -0
  111. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/performance-analysis.md +0 -0
  112. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/api-server.md +0 -0
  113. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/chunking.md +0 -0
  114. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/docker.md +0 -0
  115. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/mcp-server.md +0 -0
  116. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/metadata-extraction.md +0 -0
  117. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/ocr-backends.md +0 -0
  118. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/ocr-configuration.md +0 -0
  119. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/supported-formats.md +0 -0
  120. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/__init__.py +0 -0
  121. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/__main__.py +0 -0
  122. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_api/__init__.py +0 -0
  123. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_api/main.py +0 -0
  124. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_constants.py +0 -0
  125. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/__init__.py +0 -0
  126. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_base.py +0 -0
  127. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_email.py +0 -0
  128. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_html.py +0 -0
  129. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_pandoc.py +0 -0
  130. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_presentation.py +0 -0
  131. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_language_detection.py +0 -0
  132. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_mcp/__init__.py +0 -0
  133. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_mcp/server.py +0 -0
  134. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_ocr/__init__.py +0 -0
  135. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_playa.py +0 -0
  136. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_registry.py +0 -0
  137. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/__init__.py +0 -0
  138. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_document_cache.py +0 -0
  139. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_errors.py +0 -0
  140. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
  141. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_process_pool.py +0 -0
  142. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_quality.py +0 -0
  143. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_serialization.py +0 -0
  144. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_string.py +0 -0
  145. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_sync.py +0 -0
  146. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_table.py +0 -0
  147. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_tmp.py +0 -0
  148. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/exceptions.py +0 -0
  149. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/py.typed +0 -0
  150. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/__init__.py +0 -0
  151. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/api/__init__.py +0 -0
  152. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/api/main_test.py +0 -0
  153. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/chunker_test.py +0 -0
  154. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/cli_integration_test.py +0 -0
  155. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/cli_test.py +0 -0
  156. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/config_test.py +0 -0
  157. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/exceptions_test.py +0 -0
  158. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extraction_batch_test.py +0 -0
  159. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extraction_test.py +0 -0
  160. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/__init__.py +0 -0
  161. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/email_comprehensive_test.py +0 -0
  162. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/email_test.py +0 -0
  163. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/html_test.py +0 -0
  164. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/image_test.py +0 -0
  165. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/pandoc_metadata_test.py +0 -0
  166. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/pandoc_test.py +0 -0
  167. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/presentation_test.py +0 -0
  168. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/spreed_sheet_test.py +0 -0
  169. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/structured_test.py +0 -0
  170. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/gmft_extended_test.py +0 -0
  171. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/hooks_test.py +0 -0
  172. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/language_detection_test.py +0 -0
  173. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/mcp_server_test.py +0 -0
  174. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/mime_types_test.py +0 -0
  175. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/multiprocessing/__init__.py +0 -0
  176. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/multiprocessing/process_manager_test.py +0 -0
  177. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  178. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/__init__.py +0 -0
  179. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/base_test.py +0 -0
  180. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/easyocr_test.py +0 -0
  181. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/init_test.py +0 -0
  182. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/tesseract_test.py +0 -0
  183. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/playa_test.py +0 -0
  184. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/registry_test.py +0 -0
  185. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/better-ocr-image.jpg +0 -0
  186. /kreuzberg-3.8.2/tests/utils/__init__.py → /kreuzberg-3.9.1/tests/test_source_files/contract.txt +0 -0
  187. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/document.docx +0 -0
  188. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/email/sample-email.eml +0 -0
  189. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  190. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/excel.xlsx +0 -0
  191. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/french-text.txt +0 -0
  192. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/german-text.txt +0 -0
  193. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/html.html +0 -0
  194. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/images/test_hello_world.png +0 -0
  195. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/json/sample-document.json +0 -0
  196. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  197. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/markdown.md +0 -0
  198. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
  199. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/non-searchable.pdf +0 -0
  200. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/ocr-image.jpg +0 -0
  201. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  202. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  203. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  204. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  205. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/sample-contract.pdf +0 -0
  206. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/scanned.pdf +0 -0
  207. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/searchable.pdf +0 -0
  208. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/spanish-text.txt +0 -0
  209. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/test-article.pdf +0 -0
  210. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  211. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/types_test.py +0 -0
  212. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/cache_test.py +0 -0
  213. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/device_test.py +0 -0
  214. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/errors_test.py +0 -0
  215. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/pdf_lock_test.py +0 -0
  216. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/process_pool_test.py +0 -0
  217. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/serialization_test.py +0 -0
  218. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/string_test.py +0 -0
  219. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/sync_test.py +0 -0
  220. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/table_test.py +0 -0
  221. {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/tmp_test.py +0 -0
@@ -0,0 +1,54 @@
1
+ version = 1
2
+
3
+ test_patterns = ["tests/**"]
4
+
5
+ exclude_patterns = [
6
+ # Virtual environments
7
+ ".venv/**",
8
+ "venv/**",
9
+
10
+ # Build and distribution artifacts
11
+ "dist/**",
12
+ "build/**",
13
+ "*.egg-info/**",
14
+
15
+ # Documentation
16
+ "docs/**",
17
+ "site/**",
18
+
19
+ # Cache directories
20
+ "**/__pycache__/**",
21
+ ".pytest_cache/**",
22
+ ".mypy_cache/**",
23
+ ".ruff_cache/**",
24
+ ".coverage",
25
+ "htmlcov/**",
26
+
27
+ # Benchmarks and performance tests
28
+ "benchmarks/**",
29
+
30
+ # IDE and editor files
31
+ ".idea/**",
32
+ ".vscode/**",
33
+
34
+ # Version control
35
+ ".git/**",
36
+
37
+ # Temporary and generated files
38
+ "*.pyc",
39
+ ".DS_Store",
40
+ "*.swp",
41
+ "*.swo",
42
+ ]
43
+
44
+ [[analyzers]]
45
+ name = "test-coverage"
46
+
47
+ [[analyzers]]
48
+ name = "python"
49
+
50
+ [analyzers.meta]
51
+ runtime_version = "3.x.x"
52
+
53
+ [[transformers]]
54
+ name = "ruff"
@@ -0,0 +1,197 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ branches:
6
+ - main
7
+ push:
8
+ branches:
9
+ - main
10
+ - feat/smart-multiprocessing
11
+
12
+ jobs:
13
+ validate:
14
+ runs-on: ubuntu-latest
15
+ timeout-minutes: 10
16
+ steps:
17
+ - name: Checkout
18
+ uses: actions/checkout@v4
19
+
20
+ - name: Install uv
21
+ uses: astral-sh/setup-uv@v6
22
+ with:
23
+ enable-cache: true
24
+
25
+ - name: Set up Python
26
+ uses: actions/setup-python@v5
27
+ with:
28
+ python-version-file: "pyproject.toml"
29
+
30
+ - name: Install Dependencies
31
+ uses: nick-fields/retry@v3
32
+ with:
33
+ timeout_minutes: 5
34
+ max_attempts: 3
35
+ retry_wait_seconds: 30
36
+ command: |
37
+ if [[ "${{ runner.os }}" == "Windows" ]] && [[ -d ".venv" ]]; then
38
+ echo "Removing existing .venv directory on Windows"
39
+ rm -rf .venv
40
+ fi
41
+ uv sync --all-packages --all-extras --dev
42
+ shell: bash
43
+
44
+ - name: Load Cached Pre-Commit Dependencies
45
+ id: cached-pre-commit-dependencies
46
+ uses: actions/cache@v4
47
+ with:
48
+ path: ~/.cache/pre-commit/
49
+ key: pre-commit|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
50
+
51
+ - name: Execute Pre-Commit
52
+ run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
53
+
54
+ test:
55
+ strategy:
56
+ matrix:
57
+ os: [ ubuntu-latest, macOS-latest, windows-latest ]
58
+ python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
59
+ runs-on: ${{ matrix.os }}
60
+ timeout-minutes: 30
61
+ steps:
62
+ - name: Checkout
63
+ uses: actions/checkout@v4
64
+
65
+ - name: Install uv
66
+ uses: astral-sh/setup-uv@v6
67
+ with:
68
+ enable-cache: true
69
+
70
+ - name: Install Python
71
+ uses: actions/setup-python@v5
72
+ id: setup-python
73
+ with:
74
+ python-version: ${{ matrix.python }}
75
+
76
+ - name: Cache Python Dependencies
77
+ id: python-cache
78
+ uses: actions/cache@v4
79
+ with:
80
+ path: |
81
+ ~/.cache/uv
82
+ .venv
83
+ key: python-dependencies-${{ matrix.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('uv.lock') }}
84
+ restore-keys: |
85
+ python-dependencies-${{ matrix.os }}-${{ matrix.python }}-
86
+
87
+ - name: Install Dependencies
88
+ uses: nick-fields/retry@v3
89
+ with:
90
+ timeout_minutes: 5
91
+ max_attempts: 3
92
+ retry_wait_seconds: 30
93
+ command: |
94
+ if [[ "${{ runner.os }}" == "Windows" ]] && [[ -d ".venv" ]]; then
95
+ echo "Removing existing .venv directory on Windows"
96
+ rm -rf .venv
97
+ fi
98
+ uv sync --all-packages --all-extras --dev
99
+ shell: bash
100
+
101
+ - name: Cache Test Artifacts
102
+ uses: actions/cache@v4
103
+ with:
104
+ path: .pytest_cache/
105
+ key: pytest-cache-${{ matrix.os }}-${{ matrix.python }}
106
+
107
+ - name: Cache and Install Homebrew (macOS)
108
+ if: runner.os == 'macOS'
109
+ uses: nick-fields/retry@v3
110
+ with:
111
+ timeout_minutes: 10
112
+ max_attempts: 3
113
+ retry_wait_seconds: 30
114
+ command: |
115
+ # Using the underlying homebrew commands instead of the action
116
+ brew update || true
117
+ brew install tesseract tesseract-lang pandoc || brew upgrade tesseract tesseract-lang pandoc || true
118
+ brew list tesseract tesseract-lang pandoc
119
+ shell: bash
120
+
121
+ - name: Cache and Install APT Packages (Linux)
122
+ if: runner.os == 'Linux'
123
+ uses: nick-fields/retry@v3
124
+ with:
125
+ timeout_minutes: 5
126
+ max_attempts: 3
127
+ retry_wait_seconds: 30
128
+ command: |
129
+ sudo apt-get update
130
+ sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
131
+ shell: bash
132
+
133
+ - name: Install System Dependencies (Windows)
134
+ if: runner.os == 'Windows'
135
+ uses: nick-fields/retry@v3
136
+ with:
137
+ timeout_minutes: 10
138
+ max_attempts: 3
139
+ retry_wait_seconds: 30
140
+ command: |
141
+ choco install -y tesseract pandoc --no-progress
142
+ Write-Output "C:\Program Files\Tesseract-OCR" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
143
+ Write-Output "C:\Program Files\Pandoc" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
144
+ $env:PATH = "C:\Program Files\Tesseract-OCR;C:\Program Files\Pandoc;" + $env:PATH
145
+ tesseract --version
146
+ pandoc --version
147
+ shell: pwsh
148
+
149
+ - name: Clean Coverage Data
150
+ run: |
151
+ rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
152
+ shell: bash
153
+
154
+ - name: Run Tests with Coverage
155
+ run: |
156
+ uv run coverage erase
157
+ uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
158
+
159
+ - name: Upload Coverage Artifacts
160
+ if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
161
+ uses: actions/upload-artifact@v4
162
+ with:
163
+ name: coverage-report
164
+ path: coverage.lcov
165
+ retention-days: 1
166
+
167
+ upload-coverage:
168
+ needs: test
169
+ runs-on: ubuntu-latest
170
+ if: github.event_name == 'push' || github.event_name == 'pull_request'
171
+ steps:
172
+ - name: Checkout
173
+ uses: actions/checkout@v4
174
+ with:
175
+ ref: ${{ github.event.pull_request.head.sha || github.sha }}
176
+
177
+ - name: Download Coverage Artifacts
178
+ uses: actions/download-artifact@v4
179
+ with:
180
+ name: coverage-report
181
+ path: .
182
+
183
+ - name: Install DeepSource CLI
184
+ uses: nick-fields/retry@v3
185
+ with:
186
+ timeout_minutes: 3
187
+ max_attempts: 3
188
+ retry_wait_seconds: 10
189
+ command: |
190
+ curl -fsSL https://deepsource.io/cli | sh
191
+ shell: bash
192
+
193
+ - name: Upload Coverage to DeepSource
194
+ env:
195
+ DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
196
+ run: |
197
+ ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
@@ -1,5 +1,6 @@
1
1
  *$py.class
2
2
  *.Cache
3
+ .clause/
3
4
  *.cscfg
4
5
  *.egg-info/
5
6
  *.log
@@ -9,6 +10,8 @@
9
10
  *temp/
10
11
  .coverage
11
12
  .coverage*
13
+ coverage.lcov
14
+ htmlcov/
12
15
  .cursorrules
13
16
  .dist/
14
17
  .DS_store
@@ -53,7 +53,7 @@ repos:
53
53
  hooks:
54
54
  - id: pyproject-fmt
55
55
  - repo: https://github.com/astral-sh/ruff-pre-commit
56
- rev: v0.12.2
56
+ rev: v0.12.5
57
57
  hooks:
58
58
  - id: ruff
59
59
  args: ["--fix", "--unsafe-fixes"]
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.8.2
3
+ Version: 3.9.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
7
7
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
8
8
  License: MIT
9
9
  License-File: LICENSE
10
- Keywords: async,document-analysis,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
10
+ Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
11
11
  Classifier: Development Status :: 5 - Production/Stable
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: Intended Audience :: Information Technology
@@ -29,12 +29,12 @@ Classifier: Topic :: Text Processing :: General
29
29
  Classifier: Typing :: Typed
30
30
  Requires-Python: >=3.10
31
31
  Requires-Dist: anyio>=4.9.0
32
- Requires-Dist: chardetng-py>=0.3.4
32
+ Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.8.0
35
- Requires-Dist: mcp>=1.11.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.9.0
35
+ Requires-Dist: mcp>=1.12.2
36
36
  Requires-Dist: msgspec>=0.18.0
37
- Requires-Dist: playa-pdf>=0.6.1
37
+ Requires-Dist: playa-pdf>=0.6.4
38
38
  Requires-Dist: psutil>=7.0.0
39
39
  Requires-Dist: pypdfium2==4.30.0
40
40
  Requires-Dist: python-calamine>=0.3.2
@@ -53,18 +53,21 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
53
53
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
54
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
55
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
56
- Requires-Dist: rich>=14.0.0; extra == 'all'
56
+ Requires-Dist: rich>=14.1.0; extra == 'all'
57
57
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
58
58
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
59
59
  Requires-Dist: spacy>=3.8.7; extra == 'all'
60
60
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
61
61
  Provides-Extra: api
62
62
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
63
+ Provides-Extra: auto-classify-document-type
64
+ Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
65
+ Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
63
66
  Provides-Extra: chunking
64
67
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
65
68
  Provides-Extra: cli
66
69
  Requires-Dist: click>=8.2.1; extra == 'cli'
67
- Requires-Dist: rich>=14.0.0; extra == 'cli'
70
+ Requires-Dist: rich>=14.1.0; extra == 'cli'
68
71
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
69
72
  Provides-Extra: easyocr
70
73
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
@@ -88,7 +91,7 @@ Description-Content-Type: text/markdown
88
91
  [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
89
92
  [![Benchmarks](https://img.shields.io/badge/benchmarks-fastest%20CPU-orange)](https://benchmarks.kreuzberg.dev/)
90
93
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
91
- [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
94
+ [![DeepSource](https://app.deepsource.com/gh/Goldziher/kreuzberg.svg/?label=code+coverage&show_trend=true&token=U8AW1VWWSLwVhrbtL8LmLBDN)](https://app.deepsource.com/gh/Goldziher/kreuzberg/)
92
95
 
93
96
  **A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
94
97
 
@@ -103,6 +106,7 @@ Description-Content-Type: text/markdown
103
106
  - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
104
107
  - **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
105
108
  - **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
109
+ - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
106
110
 
107
111
  ### Technical Architecture
108
112
 
@@ -126,14 +130,14 @@ Kreuzberg leverages established open source technologies:
126
130
  ### Extract Text with CLI
127
131
 
128
132
  ```bash
129
- # Extract text from any file to markdown
130
- uvx kreuzberg extract document.pdf > output.md
133
+ # Extract text from any file to text format
134
+ uvx kreuzberg extract document.pdf > output.txt
131
135
 
132
136
  # With all features (OCR, table extraction, etc.)
133
- uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
137
+ uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr-backend tesseract --output-format text
134
138
 
135
139
  # Extract with rich metadata
136
- uvx kreuzberg extract report.pdf --show-metadata --format json
140
+ uvx kreuzberg extract report.pdf --show-metadata --output-format json
137
141
  ```
138
142
 
139
143
  ### Python Usage
@@ -5,7 +5,7 @@
5
5
  [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
6
6
  [![Benchmarks](https://img.shields.io/badge/benchmarks-fastest%20CPU-orange)](https://benchmarks.kreuzberg.dev/)
7
7
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
- [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
8
+ [![DeepSource](https://app.deepsource.com/gh/Goldziher/kreuzberg.svg/?label=code+coverage&show_trend=true&token=U8AW1VWWSLwVhrbtL8LmLBDN)](https://app.deepsource.com/gh/Goldziher/kreuzberg/)
9
9
 
10
10
  **A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
11
11
 
@@ -20,6 +20,7 @@
20
20
  - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
21
21
  - **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
22
22
  - **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
23
+ - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
23
24
 
24
25
  ### Technical Architecture
25
26
 
@@ -43,14 +44,14 @@ Kreuzberg leverages established open source technologies:
43
44
  ### Extract Text with CLI
44
45
 
45
46
  ```bash
46
- # Extract text from any file to markdown
47
- uvx kreuzberg extract document.pdf > output.md
47
+ # Extract text from any file to text format
48
+ uvx kreuzberg extract document.pdf > output.txt
48
49
 
49
50
  # With all features (OCR, table extraction, etc.)
50
- uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
51
+ uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr-backend tesseract --output-format text
51
52
 
52
53
  # Extract with rich metadata
53
- uvx kreuzberg extract report.pdf --show-metadata --format json
54
+ uvx kreuzberg extract report.pdf --show-metadata --output-format json
54
55
  ```
55
56
 
56
57
  ### Python Usage
@@ -43,7 +43,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
43
43
  print(f"Tables: {len(cold_result.tables)}")
44
44
  print(f"Chunks: {len(cold_result.chunks)}")
45
45
 
46
- from kreuzberg._utils._cache import (
46
+ from kreuzberg._utils._cache import ( # noqa: PLC0415
47
47
  get_ocr_cache,
48
48
  get_table_cache,
49
49
  get_mime_cache,
@@ -130,7 +130,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
130
130
  f" Cache consistency: {'✅ STABLE' if warm_clean_stdev / warm_clean_mean < 0.1 else '⚠️ VARIABLE'}"
131
131
  )
132
132
 
133
- from kreuzberg._utils._cache import (
133
+ from kreuzberg._utils._cache import ( # noqa: PLC0415
134
134
  get_ocr_cache,
135
135
  get_table_cache,
136
136
  get_mime_cache,
@@ -0,0 +1,49 @@
1
+ # Changelog
2
+
3
+ All notable changes to Kreuzberg will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [3.9.0] - 2025-01-17
9
+
10
+ ### Added
11
+
12
+ - Automatic Document Type Detection (#88) - A new feature for classifying documents into categories (contract, form, invoice, receipt, report)
13
+ - Integration with Google Translate for multi-language support
14
+ - New optional dependency group `auto-classify-document-type` with `deep-translator` and `pandas`
15
+ - Comprehensive tests and documentation
16
+ - DeepSource integration for code quality analysis
17
+
18
+ ### Fixed
19
+
20
+ - PDF extraction handling when no OCR backend is available
21
+ - Entity extraction test updated to use frozenset of tuples
22
+ - Config handling for dataclasses with `slots=True` - replaced `config.__dict__` with `asdict(config)`
23
+ - Coverage configuration and cleanup issues
24
+
25
+ ### Changed
26
+
27
+ - CI/CD: Added retry logic for flaky steps across all platforms
28
+ - Improved coverage gathering and cleanup in test runs
29
+ - Updated dependencies in `uv.lock`
30
+
31
+ ## [3.8.2] - Previous Release
32
+
33
+ ### Added
34
+
35
+ - Documentation site with comprehensive examples and API reference
36
+ - Improved configuration for all OCR backends
37
+ - Added hooks system for validation and post-processing
38
+ - Language detection feature with `auto_detect_language` configuration option
39
+ - New optional dependency group `langdetect` for automatic language detection
40
+
41
+ ### Changed
42
+
43
+ - Refactored internal structure for better maintainability
44
+ - Updated extraction functions to use config object instead of kwargs
45
+ - Improved error messages and reporting
46
+
47
+ ## Previous Versions
48
+
49
+ For a complete history of changes, please refer to the [GitHub releases page](https://github.com/Goldziher/kreuzberg/releases).
@@ -49,6 +49,7 @@ Kreuzberg addresses the complete document intelligence pipeline through a modula
49
49
  - **OCR Engines**: Tesseract (default), EasyOCR, PaddleOCR with automatic fallback strategies
50
50
  - **Data Extraction**: Text content, document metadata, table structures, and embedded resources
51
51
  - **Processing Capabilities**: Content chunking for RAG pipelines, language detection, format preservation
52
+ - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
52
53
  - **Extensibility**: Plugin architecture for custom extractors and hooks
53
54
 
54
55
  ## Architecture Philosophy
@@ -131,3 +131,31 @@ async def show_metadata():
131
131
 
132
132
  asyncio.run(show_metadata())
133
133
  ```
134
+
135
+ ## Document Classification
136
+
137
+ Kreuzberg can automatically classify documents into categories (contracts, forms, invoices, receipts, reports):
138
+
139
+ ```python
140
+ import asyncio
141
+ from kreuzberg import extract_file, ExtractionConfig
142
+
143
+ async def classify_document():
144
+ config = ExtractionConfig(
145
+ auto_detect_document_type=True,
146
+ document_classification_mode="text", # or "vision" for better accuracy
147
+ type_confidence_threshold=0.5,
148
+ )
149
+
150
+ result = await extract_file("invoice.pdf", config=config)
151
+
152
+ # Access classification results
153
+ if result.document_type:
154
+ print(f"Document type: {result.document_type}")
155
+ print(f"Confidence: {result.document_type_confidence:.2%}")
156
+
157
+ # The extracted content is still available
158
+ print(f"Content: {result.content[:200]}...")
159
+
160
+ asyncio.run(classify_document())
161
+ ```
@@ -0,0 +1,53 @@
1
+ # Automatic Document Classification
2
+
3
+ Kreuzberg can automatically classify documents into common types like invoices, contracts, and receipts. This allows you to build custom processing pipelines tailored to each document type.
4
+
5
+ ## Enabling Document Classification
6
+
7
+ To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
8
+
9
+ ```python
10
+ from kreuzberg import ExtractionConfig, extract_file
11
+
12
+ config = ExtractionConfig(auto_detect_document_type=True)
13
+ result = await extract_file("path/to/your/document.pdf", config=config)
14
+
15
+ if result.document_type:
16
+ print(f"Detected document type: {result.document_type}")
17
+ print(f"Confidence: {result.document_type_confidence:.2f}")
18
+ ```
19
+
20
+ ## Classification Modes
21
+
22
+ You can choose between two classification modes using the `document_classification_mode` parameter in `ExtractionConfig`:
23
+
24
+ - `"text"` (default): This mode uses a rule-based classifier that analyzes the extracted text for keywords and patterns. It's fast and works well for text-based documents.
25
+ - `"vision"`: This mode uses layout information from OCR to identify document types. It's more accurate for scanned documents and images, but it requires the Tesseract OCR backend.
26
+
27
+ Here's how to use the vision-based classifier:
28
+
29
+ ```python
30
+ config = ExtractionConfig(
31
+ auto_detect_document_type=True,
32
+ document_classification_mode="vision",
33
+ force_ocr=True, # Recommended for vision-based classification
34
+ )
35
+ ```
36
+
37
+ ## Confidence Threshold
38
+
39
+ You can control the minimum confidence required for a classification to be considered valid by setting the `type_confidence_threshold` in `ExtractionConfig`. The default value is `0.7`.
40
+
41
+ ```python
42
+ config = ExtractionConfig(
43
+ auto_detect_document_type=True,
44
+ type_confidence_threshold=0.85, # Require 85% confidence
45
+ )
46
+ ```
47
+
48
+ ## Output
49
+
50
+ The classification results are available in the `ExtractionResult` object:
51
+
52
+ - `document_type`: The detected document type (e.g., `"invoice"`, `"contract"`) or `None` if no type was detected with sufficient confidence.
53
+ - `document_type_confidence`: The confidence score of the detection (a float between 0.0 and 1.0) or `None`.
@@ -31,6 +31,9 @@ max_chars = 2000
31
31
  max_overlap = 100
32
32
  ocr_backend = "tesseract"
33
33
  auto_detect_language = true
34
+ auto_detect_document_type = true
35
+ document_classification_mode = "text" # or "vision"
36
+ type_confidence_threshold = 0.5
34
37
 
35
38
  # Tesseract OCR configuration
36
39
  [tesseract]
@@ -76,6 +79,9 @@ force_ocr = false
76
79
  chunk_content = true
77
80
  extract_tables = true
78
81
  auto_detect_language = true
82
+ auto_detect_document_type = true
83
+ document_classification_mode = "text"
84
+ type_confidence_threshold = 0.5
79
85
 
80
86
  [tool.kreuzberg.tesseract]
81
87
  language = "eng"
@@ -8,6 +8,7 @@ This guide provides comprehensive documentation for the Kreuzberg document intel
8
8
  - [Extraction Configuration](extraction-configuration.md) - Configure the extraction process ([API](../api-reference/types.md#extractionconfig))
9
9
  - [Metadata Extraction](metadata-extraction.md) - Document metadata extraction ([API](../api-reference/types.md#metadata))
10
10
  - [Content Chunking](chunking.md) - Split documents into manageable chunks
11
+ - [Document Classification](document-classification.md) - Automatic document type detection
11
12
  - [OCR Configuration](ocr-configuration.md) - Configure OCR settings ([API](../api-reference/ocr-configuration.md))
12
13
  - [OCR Backends](ocr-backends.md) - Choose and configure different OCR engines
13
14
  - [Supported Formats](supported-formats.md) - All supported document formats
@@ -2,9 +2,9 @@ from __future__ import annotations
2
2
 
3
3
  from typing import TYPE_CHECKING
4
4
 
5
- from kreuzberg import MissingDependencyError
6
5
  from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
7
6
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
7
+ from kreuzberg.exceptions import MissingDependencyError
8
8
 
9
9
  if TYPE_CHECKING:
10
10
  from semantic_text_splitter import MarkdownSplitter, TextSplitter
@@ -36,11 +36,11 @@ def get_chunker(
36
36
  if key not in _chunkers:
37
37
  try:
38
38
  if mime_type == MARKDOWN_MIME_TYPE:
39
- from semantic_text_splitter import MarkdownSplitter
39
+ from semantic_text_splitter import MarkdownSplitter # noqa: PLC0415
40
40
 
41
41
  _chunkers[key] = MarkdownSplitter(max_characters, overlap_characters)
42
42
  else:
43
- from semantic_text_splitter import TextSplitter
43
+ from semantic_text_splitter import TextSplitter # noqa: PLC0415
44
44
 
45
45
  _chunkers[key] = TextSplitter(max_characters, overlap_characters)
46
46
  except ImportError as e:
@@ -95,7 +95,7 @@ def parse_ocr_backend_config(
95
95
  # Convert psm integer to PSMMode enum if needed
96
96
  processed_config = backend_config.copy()
97
97
  if "psm" in processed_config and isinstance(processed_config["psm"], int):
98
- from kreuzberg._ocr._tesseract import PSMMode
98
+ from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
99
99
 
100
100
  processed_config["psm"] = PSMMode(processed_config["psm"])
101
101
  return TesseractConfig(**processed_config)