kreuzberg 3.13.1__tar.gz → 3.13.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.github/workflows/ci.yaml +5 -5
  2. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.github/workflows/docker-e2e-tests.yml +1 -2
  3. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.github/workflows/docs.yml +1 -1
  4. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.github/workflows/publish-docker.yml +0 -1
  5. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.github/workflows/release.yaml +1 -1
  6. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.gitignore +3 -0
  7. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.pre-commit-config.yaml +8 -7
  8. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/PKG-INFO +8 -8
  9. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/ai-rulez.yaml +164 -170
  10. kreuzberg-3.13.3/docker-logs/docker-info.txt +60 -0
  11. kreuzberg-3.13.3/docker-logs/docker-version.txt +27 -0
  12. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_api/main.py +40 -2
  13. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_config.py +4 -0
  14. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_extractors/_spread_sheet.py +17 -2
  15. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_ocr/_tesseract.py +14 -3
  16. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/cli.py +2 -2
  17. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/pyproject.toml +11 -11
  18. kreuzberg-3.13.3/test_report.json +16 -0
  19. kreuzberg-3.13.3/tests/api/large_file_test.py +184 -0
  20. kreuzberg-3.13.3/tests/api/mounted_config_test.py +184 -0
  21. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/api/runtime_config_test.py +1 -1
  22. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/document_classification_test.py +15 -15
  23. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/gmft_test.py +1 -1
  24. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/mcp_server_test.py +2 -2
  25. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/ocr/easyocr_test.py +0 -9
  26. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/ocr/paddleocr_test.py +0 -13
  27. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/ocr/tesseract_test.py +0 -2
  28. kreuzberg-3.13.3/tests/regression_api_test.py +81 -0
  29. kreuzberg-3.13.3/tests/regression_test.py +159 -0
  30. kreuzberg-3.13.3/tests/regression_with_config_test.py +145 -0
  31. kreuzberg-3.13.3/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
  32. kreuzberg-3.13.3/tests/test_source_files/google-doc-document.pdf +0 -0
  33. kreuzberg-3.13.3/tests/test_source_files/testXls.xls +0 -0
  34. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/ocr_cache_test.py +0 -37
  35. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/table_test.py +21 -21
  36. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/uv.lock +413 -339
  37. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.commitlintrc +0 -0
  38. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.deepsource.toml +0 -0
  39. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.docker/Dockerfile +0 -0
  40. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.docker/README.md +0 -0
  41. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.dockerignore +0 -0
  42. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.github/dependabot.yaml +0 -0
  43. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.github/workflows/pr-title.yaml +0 -0
  44. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.github/workflows/test-docker-builds.yml +0 -0
  45. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/.markdownlint.yaml +0 -0
  46. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/LICENSE +0 -0
  47. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/README.md +0 -0
  48. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/Taskfile.yml +0 -0
  49. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/benchmarks/README.md +0 -0
  50. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/benchmarks/__init__.py +0 -0
  51. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/benchmarks/pyproject.toml +0 -0
  52. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/benchmarks/src/__init__.py +0 -0
  53. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/benchmarks/src/__main__.py +0 -0
  54. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/benchmarks/src/benchmarks.py +0 -0
  55. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/benchmarks/src/cli.py +0 -0
  56. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/benchmarks/src/models.py +0 -0
  57. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/benchmarks/src/profiler.py +0 -0
  58. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/benchmarks/src/runner.py +0 -0
  59. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docker-compose.example.yml +0 -0
  60. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/advanced/custom-extractors.md +0 -0
  61. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/advanced/custom-hooks.md +0 -0
  62. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/advanced/error-handling.md +0 -0
  63. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/advanced/index.md +0 -0
  64. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/advanced/performance.md +0 -0
  65. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/api-reference/exceptions.md +0 -0
  66. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/api-reference/extraction-functions.md +0 -0
  67. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/api-reference/extractor-registry.md +0 -0
  68. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/api-reference/index.md +0 -0
  69. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/api-reference/ocr-configuration.md +0 -0
  70. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/api-reference/types.md +0 -0
  71. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/assets/favicon.png +0 -0
  72. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/assets/logo.png +0 -0
  73. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/cli.md +0 -0
  74. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/contributing.md +0 -0
  75. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/css/extra.css +0 -0
  76. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/examples/extraction-examples.md +0 -0
  77. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/examples/index.md +0 -0
  78. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/getting-started/index.md +0 -0
  79. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/getting-started/installation.md +0 -0
  80. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/getting-started/quick-start.md +0 -0
  81. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/index.md +0 -0
  82. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/api-server.md +0 -0
  83. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/basic-usage.md +0 -0
  84. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/chunking.md +0 -0
  85. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/docker.md +0 -0
  86. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/document-classification.md +0 -0
  87. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/extraction-configuration.md +0 -0
  88. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/index.md +0 -0
  89. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/mcp-server.md +0 -0
  90. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/metadata-extraction.md +0 -0
  91. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/ocr-backends.md +0 -0
  92. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/ocr-configuration.md +0 -0
  93. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/docs/user-guide/supported-formats.md +0 -0
  94. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/__init__.py +0 -0
  95. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/__main__.py +0 -0
  96. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_api/__init__.py +0 -0
  97. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_chunker.py +0 -0
  98. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_constants.py +0 -0
  99. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_document_classification.py +0 -0
  100. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_entity_extraction.py +0 -0
  101. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_extractors/__init__.py +0 -0
  102. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_extractors/_base.py +0 -0
  103. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_extractors/_email.py +0 -0
  104. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_extractors/_html.py +0 -0
  105. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_extractors/_image.py +0 -0
  106. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_extractors/_pandoc.py +0 -0
  107. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_extractors/_pdf.py +0 -0
  108. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_extractors/_presentation.py +0 -0
  109. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_extractors/_structured.py +0 -0
  110. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_gmft.py +0 -0
  111. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_language_detection.py +0 -0
  112. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_mcp/__init__.py +0 -0
  113. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_mcp/server.py +0 -0
  114. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_mime_types.py +0 -0
  115. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_ocr/__init__.py +0 -0
  116. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_ocr/_base.py +0 -0
  117. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_ocr/_easyocr.py +0 -0
  118. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_ocr/_paddleocr.py +0 -0
  119. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_ocr/_table_extractor.py +0 -0
  120. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_playa.py +0 -0
  121. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_registry.py +0 -0
  122. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_types.py +0 -0
  123. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/__init__.py +0 -0
  124. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_cache.py +0 -0
  125. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_device.py +0 -0
  126. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_document_cache.py +0 -0
  127. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_errors.py +0 -0
  128. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_ocr_cache.py +0 -0
  129. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_pdf_lock.py +0 -0
  130. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_process_pool.py +0 -0
  131. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_quality.py +0 -0
  132. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_ref.py +0 -0
  133. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_serialization.py +0 -0
  134. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_string.py +0 -0
  135. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_sync.py +0 -0
  136. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_table.py +0 -0
  137. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/_utils/_tmp.py +0 -0
  138. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/exceptions.py +0 -0
  139. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/extraction.py +0 -0
  140. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/kreuzberg/py.typed +0 -0
  141. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/mkdocs.yaml +0 -0
  142. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/output.txt +0 -0
  143. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/results/baseline.json +0 -0
  144. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/results/serialization.json +0 -0
  145. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/results/statistical.json +0 -0
  146. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/__init__.py +0 -0
  147. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/api/__init__.py +0 -0
  148. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/api/conftest.py +0 -0
  149. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/api/main_test.py +0 -0
  150. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/chunker_test.py +0 -0
  151. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/cli_command_test.py +0 -0
  152. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/cli_integration_test.py +0 -0
  153. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/cli_test.py +0 -0
  154. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/config_test.py +0 -0
  155. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/conftest.py +0 -0
  156. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/e2e/__init__.py +0 -0
  157. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/e2e/docker_e2e_test.py +0 -0
  158. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/entity_extraction_test.py +0 -0
  159. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/exceptions_test.py +0 -0
  160. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extraction_batch_test.py +0 -0
  161. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extraction_test.py +0 -0
  162. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extractors/__init__.py +0 -0
  163. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extractors/email_test.py +0 -0
  164. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extractors/html_test.py +0 -0
  165. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extractors/image_test.py +0 -0
  166. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extractors/pandoc_metadata_test.py +0 -0
  167. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extractors/pandoc_test.py +0 -0
  168. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extractors/pdf_test.py +0 -0
  169. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extractors/presentation_test.py +0 -0
  170. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extractors/spreed_sheet_test.py +0 -0
  171. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/extractors/structured_test.py +0 -0
  172. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/gmft_extended_test.py +0 -0
  173. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/hooks_test.py +0 -0
  174. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/html_to_markdown_config_test.py +0 -0
  175. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/language_detection_test.py +0 -0
  176. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/mime_types_test.py +0 -0
  177. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/multiprocessing/__init__.py +0 -0
  178. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/multiprocessing/gmft_integration_test.py +0 -0
  179. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  180. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/multiprocessing/process_manager_test.py +0 -0
  181. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  182. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/ocr/__init__.py +0 -0
  183. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/ocr/base_test.py +0 -0
  184. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/ocr/device_integration_test.py +0 -0
  185. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/ocr/init_test.py +0 -0
  186. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/ocr/tesseract_tsv_integration_test.py +0 -0
  187. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/ocr/tesseract_tsv_test.py +0 -0
  188. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/playa_helpers_test.py +0 -0
  189. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/playa_test.py +0 -0
  190. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/registry_test.py +0 -0
  191. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/tesseract_sync_formats_test.py +0 -0
  192. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/contract.txt +0 -0
  193. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/contract_test.txt +0 -0
  194. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/document.docx +0 -0
  195. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/email/sample-email.eml +0 -0
  196. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  197. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/excel.xlsx +0 -0
  198. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/flower-no-text.jpg +0 -0
  199. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/form_test.txt +0 -0
  200. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/french-text.txt +0 -0
  201. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/german-text.txt +0 -0
  202. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/html.html +0 -0
  203. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/images/test_hello_world.png +0 -0
  204. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/invoice_image.png +0 -0
  205. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/invoice_test.txt +0 -0
  206. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/json/sample-document.json +0 -0
  207. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  208. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/markdown.md +0 -0
  209. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/non-ascii-text.pdf +0 -0
  210. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/non-searchable.pdf +0 -0
  211. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/ocr-image.jpg +0 -0
  212. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  213. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  214. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  215. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  216. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/receipt_test.txt +0 -0
  217. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/report_test.txt +0 -0
  218. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/sample-contract.pdf +0 -0
  219. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/scanned.pdf +0 -0
  220. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/searchable.pdf +0 -0
  221. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/spanish-text.txt +0 -0
  222. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/tables/borderless_table.png +0 -0
  223. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/tables/complex_document.png +0 -0
  224. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/tables/simple_table.png +0 -0
  225. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/test-article.pdf +0 -0
  226. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  227. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/types_test.py +0 -0
  228. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/__init__.py +0 -0
  229. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/cache_test.py +0 -0
  230. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/device_test.py +0 -0
  231. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/errors_test.py +0 -0
  232. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/pdf_lock_test.py +0 -0
  233. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/process_pool_test.py +0 -0
  234. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/ref_test.py +0 -0
  235. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/serialization_test.py +0 -0
  236. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/string_test.py +0 -0
  237. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/sync_test.py +0 -0
  238. {kreuzberg-3.13.1 → kreuzberg-3.13.3}/tests/utils/tmp_test.py +0 -0
@@ -23,7 +23,7 @@ jobs:
23
23
  enable-cache: true
24
24
 
25
25
  - name: Set up Python
26
- uses: actions/setup-python@v5
26
+ uses: actions/setup-python@v6
27
27
  with:
28
28
  python-version-file: "pyproject.toml"
29
29
 
@@ -54,7 +54,7 @@ jobs:
54
54
  coverage:
55
55
  needs: validate
56
56
  runs-on: ubuntu-latest
57
- timeout-minutes: 20
57
+ timeout-minutes: 120
58
58
  steps:
59
59
  - name: Checkout
60
60
  uses: actions/checkout@v5
@@ -65,7 +65,7 @@ jobs:
65
65
  enable-cache: true
66
66
 
67
67
  - name: Install Python
68
- uses: actions/setup-python@v5
68
+ uses: actions/setup-python@v6
69
69
  id: setup-python
70
70
  with:
71
71
  python-version: "3.13"
@@ -142,7 +142,7 @@ jobs:
142
142
  matrix:
143
143
  os: [ubuntu-latest, windows-latest, macos-latest]
144
144
  python: ["3.10", "3.11", "3.12", "3.13"]
145
- timeout-minutes: 30
145
+ timeout-minutes: 120
146
146
  steps:
147
147
  - name: Checkout
148
148
  uses: actions/checkout@v5
@@ -153,7 +153,7 @@ jobs:
153
153
  enable-cache: true
154
154
 
155
155
  - name: Install Python
156
- uses: actions/setup-python@v5
156
+ uses: actions/setup-python@v6
157
157
  id: setup-python
158
158
  with:
159
159
  python-version: ${{ matrix.python }}
@@ -25,7 +25,7 @@ jobs:
25
25
  enable-cache: true
26
26
 
27
27
  - name: Set up Python
28
- uses: actions/setup-python@v5
28
+ uses: actions/setup-python@v6
29
29
  with:
30
30
  python-version-file: "pyproject.toml"
31
31
 
@@ -48,7 +48,6 @@ jobs:
48
48
  sudo rm -rf /opt/ghc
49
49
  sudo rm -rf /opt/hostedtoolcache/CodeQL
50
50
  sudo rm -rf /usr/local/share/boost
51
- sudo rm -rf "$AGENT_TOOLSDIRECTORY"
52
51
 
53
52
  sudo apt-get clean
54
53
  sudo apt-get autoremove -y
@@ -29,7 +29,7 @@ jobs:
29
29
  fetch-depth: 0
30
30
 
31
31
  - name: Setup Python
32
- uses: actions/setup-python@v5
32
+ uses: actions/setup-python@v6
33
33
  with:
34
34
  python-version: '3.11'
35
35
 
@@ -62,7 +62,6 @@ jobs:
62
62
  sudo rm -rf /usr/local/lib/node_modules
63
63
  sudo rm -rf /opt/microsoft
64
64
  sudo rm -rf /usr/local/.ghcup
65
- sudo rm -rf /opt/hostedtoolcache
66
65
 
67
66
  # Clean apt
68
67
  sudo apt-get clean
@@ -21,7 +21,7 @@ jobs:
21
21
  enable-cache: true
22
22
 
23
23
  - name: Set up Python
24
- uses: actions/setup-python@v5
24
+ uses: actions/setup-python@v6
25
25
  with:
26
26
  python-version-file: "pyproject.toml"
27
27
 
@@ -66,3 +66,6 @@ yarn-error.log*
66
66
  *.tmp
67
67
  *.temp
68
68
  .tmp/
69
+
70
+ # AI Rules generated files
71
+ .claude/agents/
@@ -5,11 +5,12 @@ repos:
5
5
  - id: commitlint
6
6
  stages: [commit-msg]
7
7
  additional_dependencies: ["@commitlint/config-conventional"]
8
- - repo: https://github.com/Goldziher/ai-rulez
9
- rev: v1.6.1
10
- hooks:
11
- - id: ai-rulez-validate
12
- - id: ai-rulez-generate
8
+ # Temporarily disabled due to CI environment issues
9
+ # - repo: https://github.com/Goldziher/ai-rulez
10
+ # rev: v2.0.1
11
+ # hooks:
12
+ # - id: ai-rulez-validate
13
+ # - id: ai-rulez-generate
13
14
  - repo: https://github.com/pre-commit/pre-commit-hooks
14
15
  rev: v6.0.0
15
16
  hooks:
@@ -37,7 +38,7 @@ repos:
37
38
  hooks:
38
39
  - id: markdownlint-fix
39
40
  - repo: https://github.com/adamchainz/blacken-docs
40
- rev: 1.19.1
41
+ rev: 1.20.0
41
42
  hooks:
42
43
  - id: blacken-docs
43
44
  args: ["--pyi", "--line-length", "130"]
@@ -53,7 +54,7 @@ repos:
53
54
  hooks:
54
55
  - id: pyproject-fmt
55
56
  - repo: https://github.com/astral-sh/ruff-pre-commit
56
- rev: v0.12.11
57
+ rev: v0.12.12
57
58
  hooks:
58
59
  - id: ruff
59
60
  args: ["--fix", "--unsafe-fixes"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.13.1
3
+ Version: 3.13.3
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,15 +31,15 @@ Requires-Python: >=3.10
31
31
  Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.9.1
34
+ Requires-Dist: html-to-markdown[lxml]>=1.10.0
35
35
  Requires-Dist: mcp>=1.13.0
36
36
  Requires-Dist: msgspec>=0.18.0
37
37
  Requires-Dist: numpy>=1.24.0
38
38
  Requires-Dist: playa-pdf>=0.7.0
39
- Requires-Dist: polars>=1.33.0
39
+ Requires-Dist: polars>=1.33.1
40
40
  Requires-Dist: psutil>=7.0.0
41
41
  Requires-Dist: pypdfium2==4.30.0
42
- Requires-Dist: python-calamine>=0.5.2
42
+ Requires-Dist: python-calamine>=0.5.3
43
43
  Requires-Dist: python-pptx>=1.0.2
44
44
  Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
45
45
  Provides-Extra: additional-extensions
@@ -55,17 +55,17 @@ Requires-Dist: keybert>=0.9.0; extra == 'all'
55
55
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
56
56
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
57
57
  Requires-Dist: paddleocr>=3.2.0; extra == 'all'
58
- Requires-Dist: paddlepaddle>=3.1.1; extra == 'all'
58
+ Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
59
59
  Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
60
60
  Requires-Dist: rich>=14.1.0; extra == 'all'
61
- Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
61
+ Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
62
62
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
63
63
  Requires-Dist: spacy>=3.8.7; extra == 'all'
64
64
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
65
65
  Provides-Extra: api
66
66
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
67
67
  Provides-Extra: chunking
68
- Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
68
+ Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
69
69
  Provides-Extra: cli
70
70
  Requires-Dist: click>=8.2.1; extra == 'cli'
71
71
  Requires-Dist: rich>=14.1.0; extra == 'cli'
@@ -85,7 +85,7 @@ Provides-Extra: langdetect
85
85
  Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
86
86
  Provides-Extra: paddleocr
87
87
  Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
88
- Requires-Dist: paddlepaddle>=3.1.1; extra == 'paddleocr'
88
+ Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
89
89
  Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
90
90
  Description-Content-Type: text/markdown
91
91
 
@@ -1,115 +1,8 @@
1
- metadata:
2
- name: "Kreuzberg"
3
- version: "3.5.0"
4
- description: "A text extraction library supporting PDFs, images, office documents and more"
5
-
6
- targets:
7
- claude-files:
8
- - "CLAUDE.md"
9
- - ".claude/**/*.md"
10
-
11
- cursor-files:
12
- - ".cursorrules"
13
-
14
- windsurf-files:
15
- - ".windsurfrules"
16
-
17
- gemini-files:
18
- - "GEMINI.md"
19
-
20
- outputs:
21
- - file: "CLAUDE.md"
22
- template: |
23
- # CLAUDE.md
24
-
25
- This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
26
-
27
- ## Project Overview
28
-
29
- {{ .Description }}
30
-
31
- {{ range .Rules }}
32
- ## {{ .Name }}
33
- {{ .Content }}
34
- {{ end }}
35
-
36
- {{ range .Sections }}
37
- ## {{ .Title }}
38
- {{ .Content }}
39
- {{ end }}
40
-
41
- - file: "GEMINI.md"
42
- template: |
43
- # GEMINI.md
44
-
45
- This file provides guidance to Gemini when working with code in this repository.
46
-
47
- ## Project Overview
48
-
49
- {{ .Description }}
50
-
51
- {{ range .Rules }}
52
- ## {{ .Name }}
53
- {{ .Content }}
54
- {{ end }}
55
-
56
- {{ range .Sections }}
57
- ## {{ .Title }}
58
- {{ .Content }}
59
- {{ end }}
60
-
61
- - file: ".cursorrules"
62
- template: |
63
- # Kreuzberg
64
- A text extraction library supporting PDFs, images, office documents and more
65
-
66
- {{ range .Rules }}
67
- ## {{ .Name }}
68
- {{ .Content }}
69
- {{ end }}
70
-
71
- {{ range .Sections }}
72
- ## {{ .Title }}
73
- {{ .Content }}
74
- {{ end }}
75
-
76
- - file: ".windsurfrules"
77
- template: |
78
- # Kreuzberg
79
- A text extraction library supporting PDFs, images, office documents and more
80
-
81
- {{ range .Rules }}
82
- ## {{ .Name }}
83
- {{ .Content }}
84
- {{ end }}
85
-
86
- {{ range .Sections }}
87
- ## {{ .Title }}
88
- {{ .Content }}
89
- {{ end }}
90
-
91
- - path: ".claude/agents/"
92
- type: "agent"
93
- naming_scheme: "{priority:02d}-{name}.md"
94
- template: |
95
- # Agent: {{ .Name }}
96
-
97
- **Description**: {{ .Description }}
98
- **Priority**: {{ .Priority }}
99
-
100
- ## Tools Available
101
- {{ range .Tools }}
102
- - {{ . }}
103
- {{ end }}
104
-
105
- ## System Prompt
106
- {{ .SystemPrompt }}
107
-
108
1
  agents:
109
- - name: "python-engineer"
110
- description: "Expert Python developer for implementing features and fixing bugs"
111
- priority: 10
112
- targets: ["@claude-files"]
2
+ - description: Expert Python developer for implementing features and fixing bugs
3
+ id: python-engineer
4
+ name: python-engineer
5
+ priority: critical
113
6
  system_prompt: |
114
7
  You are an expert Python engineer specializing in the Kreuzberg text extraction library.
115
8
 
@@ -138,11 +31,13 @@ agents:
138
31
  - ALWAYS prefer editing existing files
139
32
  - All builtin imports at top level (except cyclical/optional)
140
33
  - Maintain 95% test coverage requirement
141
-
142
- - name: "test-engineer"
143
- description: "Writes comprehensive tests for kreuzberg functionality"
144
- priority: 9
145
- targets: ["@claude-files"]
34
+ targets:
35
+ - CLAUDE.md
36
+ - .claude/**/*.md
37
+ - description: Writes comprehensive tests for kreuzberg functionality
38
+ id: test-engineer
39
+ name: test-engineer
40
+ priority: high
146
41
  system_prompt: |
147
42
  You are a test automation expert for the Kreuzberg text extraction library.
148
43
 
@@ -173,16 +68,13 @@ agents:
173
68
  - Use descriptive names that explain what's being tested
174
69
 
175
70
  Always ensure tests are isolated, repeatable, and fast.
176
-
177
- - name: "code-reviewer"
178
- description: "Reviews code for quality, security, and compliance"
179
- priority: 10
180
- targets: ["@claude-files"]
181
- tools:
182
- - "Read"
183
- - "Grep"
184
- - "Glob"
185
- - "LS"
71
+ targets:
72
+ - CLAUDE.md
73
+ - .claude/**/*.md
74
+ - description: Reviews code for quality, security, and compliance
75
+ id: code-reviewer
76
+ name: code-reviewer
77
+ priority: critical
186
78
  system_prompt: |
187
79
  You are a senior code reviewer for the Kreuzberg text extraction library.
188
80
 
@@ -231,11 +123,18 @@ agents:
231
123
 
232
124
  Provide specific, actionable feedback with code examples when relevant.
233
125
  Rate severity: Critical, High, Medium, Low.
234
-
235
- - name: "infra-engineer"
236
- description: "Manages infrastructure, CI/CD, Docker, and deployment configurations"
237
- priority: 9
238
- targets: ["@claude-files"]
126
+ targets:
127
+ - CLAUDE.md
128
+ - .claude/**/*.md
129
+ tools:
130
+ - Read
131
+ - Grep
132
+ - Glob
133
+ - LS
134
+ - description: Manages infrastructure, CI/CD, Docker, and deployment configurations
135
+ id: infra-engineer
136
+ name: infra-engineer
137
+ priority: high
239
138
  system_prompt: |
240
139
  You are an infrastructure and DevOps engineer for the Kreuzberg text extraction library.
241
140
 
@@ -300,11 +199,13 @@ agents:
300
199
  - Security first approach
301
200
  - Resource optimization
302
201
  - Clear documentation
303
-
304
- - name: "performance-optimizer"
305
- description: "Optimizes performance and resource usage"
306
- priority: 8
307
- targets: ["@claude-files"]
202
+ targets:
203
+ - CLAUDE.md
204
+ - .claude/**/*.md
205
+ - description: Optimizes performance and resource usage
206
+ id: performance-optimizer
207
+ name: performance-optimizer
208
+ priority: medium
308
209
  system_prompt: |
309
210
  You are a performance optimization specialist for the Kreuzberg text extraction library.
310
211
 
@@ -349,11 +250,108 @@ agents:
349
250
  Always measure before and after optimization.
350
251
  Focus on bottlenecks identified through profiling.
351
252
  Ensure optimizations don't break functionality or reduce code clarity.
352
-
253
+ targets:
254
+ - CLAUDE.md
255
+ - .claude/**/*.md
256
+ metadata:
257
+ description: A text extraction library supporting PDFs, images, office documents and more
258
+ name: Kreuzberg
259
+ version: 3.5.0
260
+ outputs:
261
+ - path: CLAUDE.md
262
+ template:
263
+ type: inline
264
+ value: |
265
+ # CLAUDE.md
266
+
267
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
268
+
269
+ ## Project Overview
270
+
271
+ {{ .Description }}
272
+
273
+ {{ range .Rules }}
274
+ ## {{ .Name }}
275
+ {{ .Content }}
276
+ {{ end }}
277
+
278
+ {{ range .Sections }}
279
+ ## {{ .Name }}
280
+ {{ .Content }}
281
+ {{ end }}
282
+ - path: GEMINI.md
283
+ template:
284
+ type: inline
285
+ value: |
286
+ # GEMINI.md
287
+
288
+ This file provides guidance to Gemini when working with code in this repository.
289
+
290
+ ## Project Overview
291
+
292
+ {{ .Description }}
293
+
294
+ {{ range .Rules }}
295
+ ## {{ .Name }}
296
+ {{ .Content }}
297
+ {{ end }}
298
+
299
+ {{ range .Sections }}
300
+ ## {{ .Name }}
301
+ {{ .Content }}
302
+ {{ end }}
303
+ - path: .cursorrules
304
+ template:
305
+ type: inline
306
+ value: |
307
+ # Kreuzberg
308
+ A text extraction library supporting PDFs, images, office documents and more
309
+
310
+ {{ range .Rules }}
311
+ ## {{ .Name }}
312
+ {{ .Content }}
313
+ {{ end }}
314
+
315
+ {{ range .Sections }}
316
+ ## {{ .Name }}
317
+ {{ .Content }}
318
+ {{ end }}
319
+ - path: .windsurfrules
320
+ template:
321
+ type: inline
322
+ value: |
323
+ # Kreuzberg
324
+ A text extraction library supporting PDFs, images, office documents and more
325
+
326
+ {{ range .Rules }}
327
+ ## {{ .Name }}
328
+ {{ .Content }}
329
+ {{ end }}
330
+
331
+ {{ range .Sections }}
332
+ ## {{ .Name }}
333
+ {{ .Content }}
334
+ {{ end }}
335
+ - naming_scheme: "{priority:02d}-{name}.md"
336
+ path: .claude/agents/
337
+ template:
338
+ type: inline
339
+ value: |
340
+ # Agent: {{ .Name }}
341
+
342
+ **Description**: {{ .Description }}
343
+ **Priority**: {{ .Priority }}
344
+
345
+ ## Tools Available
346
+ {{ range .Tools }}
347
+ - {{ . }}
348
+ {{ end }}
349
+
350
+ ## System Prompt
351
+ {{ .SystemPrompt }}
352
+ type: agent
353
353
  rules:
354
- - name: "Development Commands"
355
- priority: 10
356
- content: |
354
+ - content: |
357
355
  ### Dependencies
358
356
  - Install dependencies: `uv sync`
359
357
  - Install with all optional dependencies: `uv sync --all-extras`
@@ -378,10 +376,9 @@ rules:
378
376
  - Build docs: `uv run mkdocs build --clean --strict`
379
377
  - Serve docs locally: `uv run mkdocs serve`
380
378
  - Install doc dependencies: `uv sync --group doc`
381
-
382
- - name: "Important Instructions"
383
- priority: 10
384
- content: |
379
+ name: Development Commands
380
+ priority: critical
381
+ - content: |
385
382
  - Do what has been asked; nothing more, nothing less
386
383
  - NEVER create files unless they're absolutely necessary for achieving your goal
387
384
  - ALWAYS prefer editing an existing file to creating a new one
@@ -389,10 +386,9 @@ rules:
389
386
  - Only create documentation files if explicitly requested by the User
390
387
  - All builtin imports should be at the top level (except for cyclical or optional dependencies)
391
388
  - When committing, always use the format specified in the CLAUDE.md
392
-
393
- - name: "Architecture"
394
- priority: 9
395
- content: |
389
+ name: Important Instructions
390
+ priority: critical
391
+ - content: |
396
392
  ### Core Extraction Flow
397
393
  1. **Entry Point**: `extraction.py` provides main functions (`extract_file`, `extract_bytes`, etc.)
398
394
  2. **Registry Pattern**: `ExtractorRegistry` selects appropriate extractor based on MIME type
@@ -415,10 +411,9 @@ rules:
415
411
  - New OCR backends: Implement `BaseOCR` interface
416
412
  - Configuration options: Extend `ExtractionConfig` or create backend-specific configs
417
413
  - Hooks: Use `pre_validate_hook` and `post_extraction_hook` for custom processing
418
-
419
- - name: "Error Handling"
420
- priority: 8
421
- content: |
414
+ name: Architecture
415
+ priority: high
416
+ - content: |
422
417
  - **Exception-based**: All errors are raised as exceptions, no tuple returns
423
418
  - **Base class**: All exceptions inherit from `KreuzbergError`
424
419
  - **Context**: Exceptions include a `context` parameter with debugging information
@@ -427,20 +422,18 @@ rules:
427
422
  - `ParsingError`: Document parsing failures
428
423
  - `OCRError`: OCR processing failures
429
424
  - `MissingDependencyError`: Missing optional dependencies
430
-
431
- - name: "Testing Patterns"
432
- priority: 7
433
- content: |
425
+ name: Error Handling
426
+ priority: medium
427
+ - content: |
434
428
  - Test files in `tests/test_source_files/` for various formats
435
429
  - Mock OCR responses for predictable testing
436
430
  - Both sync and async test variants
437
431
  - Comprehensive error case coverage
438
432
  - OCR tests marked as `xfail` in CI environments for resilience
439
433
  - Integration tests use timeouts and retry logic where appropriate
440
-
441
- - name: "CI/CD and Deployment"
442
- priority: 6
443
- content: |
434
+ name: Testing Patterns
435
+ priority: low
436
+ - content: |
444
437
  ### GitHub Actions Workflows
445
438
  - **Release**: Automated PyPI publishing via GitHub releases, triggers Docker builds
446
439
  - **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64), triggered by releases
@@ -462,10 +455,9 @@ rules:
462
455
  - **Docker version detection**: Use `git tag --sort=-version:refname | head -n1` not `git describe`
463
456
  - **Docs dependencies**: Use `uv sync --group doc` for proper mkdocs-material[imaging] support
464
457
  - **Docker Hub README**: May fail due to permissions, use `continue-on-error: true`
465
-
466
- - name: "Package Management"
467
- priority: 6
468
- content: |
458
+ name: CI/CD and Deployment
459
+ priority: minimal
460
+ - content: |
469
461
  ### Optional Dependencies Structure
470
462
  ```toml
471
463
  [project.optional-dependencies]
@@ -499,10 +491,10 @@ rules:
499
491
  - **Usage**: Required for PDFs with AES encryption (RC4 is supported in base installation)
500
492
  - **Password Support**: Supports single password or list of passwords to try in sequence
501
493
  - **Size Impact**: Increases installation size by ~24MB due to cryptography package
502
-
494
+ name: Package Management
495
+ priority: minimal
503
496
  sections:
504
- - title: "Language Detection"
505
- content: |
497
+ - content: |
506
498
  ### Automatic Language Detection (v3.5.0+)
507
499
  - **Feature**: Automatically detect languages in extracted text
508
500
  - **Implementation**: Uses fast-langdetect library for high-performance detection
@@ -511,9 +503,9 @@ sections:
511
503
  - Configure via `LanguageDetectionConfig` for confidence thresholds
512
504
  - **Output**: Results available in `ExtractionResult.detected_languages`
513
505
  - **Integration**: Works with all extraction methods and file types
514
-
515
- - title: "Planned Features"
516
- content: |
506
+ id: language-detection
507
+ name: Language Detection
508
+ - content: |
517
509
  ### Structured Extraction (Issue #55)
518
510
  - **Goal**: Enable extraction of structured data from documents using vision models
519
511
  - **Implementation**:
@@ -522,3 +514,5 @@ sections:
522
514
  - Use LiteLLM for vision model integration
523
515
  - Implement retry logic with error feedback
524
516
  - **Configuration**: Currently programmatic only (no `kreuzberg.toml`)
517
+ id: planned-features
518
+ name: Planned Features
@@ -0,0 +1,60 @@
1
+ Client: Docker Engine - Community
2
+ Version: 28.0.4
3
+ Context: default
4
+ Debug Mode: false
5
+ Plugins:
6
+ buildx: Docker Buildx (Docker Inc.)
7
+ Version: v0.27.0
8
+ Path: /usr/libexec/docker/cli-plugins/docker-buildx
9
+ compose: Docker Compose (Docker Inc.)
10
+ Version: v2.38.2
11
+ Path: /usr/libexec/docker/cli-plugins/docker-compose
12
+
13
+ Server:
14
+ Containers: 1
15
+ Running: 1
16
+ Paused: 0
17
+ Stopped: 0
18
+ Images: 2
19
+ Server Version: 28.0.4
20
+ Storage Driver: overlay2
21
+ Backing Filesystem: extfs
22
+ Supports d_type: true
23
+ Using metacopy: false
24
+ Native Overlay Diff: false
25
+ userxattr: false
26
+ Logging Driver: json-file
27
+ Cgroup Driver: systemd
28
+ Cgroup Version: 2
29
+ Plugins:
30
+ Volume: local
31
+ Network: bridge host ipvlan macvlan null overlay
32
+ Log: awslogs fluentd gcplogs gelf journald json-file local splunk syslog
33
+ Swarm: inactive
34
+ Runtimes: io.containerd.runc.v2 runc
35
+ Default Runtime: runc
36
+ Init Binary: docker-init
37
+ containerd version: 05044ec0a9a75232cad458027ca83437aae3f4da
38
+ runc version: v1.2.5-0-g59923ef
39
+ init version: de40ad0
40
+ Security Options:
41
+ apparmor
42
+ seccomp
43
+ Profile: builtin
44
+ cgroupns
45
+ Kernel Version: 6.11.0-1018-azure
46
+ Operating System: Ubuntu 24.04.3 LTS
47
+ OSType: linux
48
+ Architecture: x86_64
49
+ CPUs: 4
50
+ Total Memory: 15.62GiB
51
+ Name: pkrvm7jw40e0xgp
52
+ ID: 33a18c03-7dc8-4ab9-bfe1-99342b7c1aaf
53
+ Docker Root Dir: /var/lib/docker
54
+ Debug Mode: false
55
+ Username: githubactions
56
+ Experimental: false
57
+ Insecure Registries:
58
+ ::1/128
59
+ 127.0.0.0/8
60
+ Live Restore Enabled: false