kreuzberg 3.9.1__tar.gz → 3.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/PKG-INFO +4 -1
  2. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/ai-rulez.yaml +11 -1
  3. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_pdf.py +76 -5
  4. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_playa.py +6 -4
  5. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_types.py +2 -0
  6. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/pyproject.toml +3 -2
  7. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/pdf_test.py +48 -0
  8. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/uv.lock +64 -7
  9. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.commitlintrc +0 -0
  10. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.deepsource.toml +0 -0
  11. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.docker/Dockerfile +0 -0
  12. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.docker/README.md +0 -0
  13. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.dockerignore +0 -0
  14. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/dependabot.yaml +0 -0
  15. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/workflows/ci.yaml +0 -0
  16. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/workflows/docs.yml +0 -0
  17. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/workflows/pr-title.yaml +0 -0
  18. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/workflows/publish-docker.yml +0 -0
  19. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/workflows/release.yaml +0 -0
  20. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.gitignore +0 -0
  21. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.markdownlint.yaml +0 -0
  22. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.pre-commit-config.yaml +0 -0
  23. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/LICENSE +0 -0
  24. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/README.md +0 -0
  25. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/README.md +0 -0
  26. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/benchmark_baseline.py +0 -0
  27. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/end_to_end_benchmark.py +0 -0
  28. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/final_benchmark.py +0 -0
  29. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/pyproject.toml +0 -0
  30. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/baseline_results.json +0 -0
  31. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  32. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
  33. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/final_benchmark_results.json +0 -0
  34. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/latest.json +0 -0
  35. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/mime_caching_results.json +0 -0
  36. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/msgspec_caching_results.json +0 -0
  37. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/ocr_caching_results.json +0 -0
  38. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
  39. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
  40. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/table_caching_results.json +0 -0
  41. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/serialization_benchmark.py +0 -0
  42. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  43. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  44. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  45. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  46. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  47. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  48. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  49. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/statistical_benchmark.py +0 -0
  50. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/advanced/custom-extractors.md +0 -0
  51. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/advanced/custom-hooks.md +0 -0
  52. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/advanced/error-handling.md +0 -0
  53. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/advanced/index.md +0 -0
  54. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/advanced/performance.md +0 -0
  55. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/exceptions.md +0 -0
  56. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/extraction-functions.md +0 -0
  57. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/extractor-registry.md +0 -0
  58. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/index.md +0 -0
  59. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/ocr-configuration.md +0 -0
  60. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/types.md +0 -0
  61. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/assets/favicon.png +0 -0
  62. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/assets/logo.png +0 -0
  63. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/changelog.md +0 -0
  64. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/cli.md +0 -0
  65. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/contributing.md +0 -0
  66. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/css/extra.css +0 -0
  67. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/examples/extraction-examples.md +0 -0
  68. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/examples/index.md +0 -0
  69. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/getting-started/index.md +0 -0
  70. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/getting-started/installation.md +0 -0
  71. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/getting-started/quick-start.md +0 -0
  72. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/index.md +0 -0
  73. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/performance-analysis.md +0 -0
  74. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/api-server.md +0 -0
  75. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/basic-usage.md +0 -0
  76. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/chunking.md +0 -0
  77. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/docker.md +0 -0
  78. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/document-classification.md +0 -0
  79. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/extraction-configuration.md +0 -0
  80. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/index.md +0 -0
  81. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/mcp-server.md +0 -0
  82. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/metadata-extraction.md +0 -0
  83. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/ocr-backends.md +0 -0
  84. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/ocr-configuration.md +0 -0
  85. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/supported-formats.md +0 -0
  86. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/__init__.py +0 -0
  87. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/__main__.py +0 -0
  88. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_api/__init__.py +0 -0
  89. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_api/main.py +0 -0
  90. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_chunker.py +0 -0
  91. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_config.py +0 -0
  92. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_constants.py +0 -0
  93. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_document_classification.py +0 -0
  94. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_entity_extraction.py +0 -0
  95. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/__init__.py +0 -0
  96. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_base.py +0 -0
  97. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_email.py +0 -0
  98. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_html.py +0 -0
  99. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_image.py +0 -0
  100. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_pandoc.py +0 -0
  101. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_presentation.py +0 -0
  102. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  103. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_structured.py +0 -0
  104. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_gmft.py +0 -0
  105. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_language_detection.py +0 -0
  106. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_mcp/__init__.py +0 -0
  107. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_mcp/server.py +0 -0
  108. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_mime_types.py +0 -0
  109. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_ocr/__init__.py +0 -0
  110. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_ocr/_base.py +0 -0
  111. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_ocr/_easyocr.py +0 -0
  112. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
  113. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_ocr/_tesseract.py +0 -0
  114. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_registry.py +0 -0
  115. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/__init__.py +0 -0
  116. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_cache.py +0 -0
  117. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_device.py +0 -0
  118. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_document_cache.py +0 -0
  119. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_errors.py +0 -0
  120. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  121. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_process_pool.py +0 -0
  122. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_quality.py +0 -0
  123. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_serialization.py +0 -0
  124. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_string.py +0 -0
  125. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_sync.py +0 -0
  126. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_table.py +0 -0
  127. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_tmp.py +0 -0
  128. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/cli.py +0 -0
  129. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/exceptions.py +0 -0
  130. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/extraction.py +0 -0
  131. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/py.typed +0 -0
  132. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/mkdocs.yaml +0 -0
  133. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/__init__.py +0 -0
  134. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/api/__init__.py +0 -0
  135. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/api/main_test.py +0 -0
  136. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/chunker_test.py +0 -0
  137. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/cli_integration_test.py +0 -0
  138. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/cli_test.py +0 -0
  139. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/config_test.py +0 -0
  140. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/conftest.py +0 -0
  141. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/document_classification_test.py +0 -0
  142. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/entity_extraction_test.py +0 -0
  143. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/exceptions_test.py +0 -0
  144. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extraction_batch_test.py +0 -0
  145. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extraction_test.py +0 -0
  146. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/__init__.py +0 -0
  147. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/email_comprehensive_test.py +0 -0
  148. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/email_test.py +0 -0
  149. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/html_test.py +0 -0
  150. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/image_test.py +0 -0
  151. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  152. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/pandoc_test.py +0 -0
  153. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/presentation_test.py +0 -0
  154. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/spreed_sheet_test.py +0 -0
  155. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/structured_test.py +0 -0
  156. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/gmft_extended_test.py +0 -0
  157. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/gmft_test.py +0 -0
  158. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/hooks_test.py +0 -0
  159. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/language_detection_test.py +0 -0
  160. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/mcp_server_test.py +0 -0
  161. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/mime_types_test.py +0 -0
  162. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/multiprocessing/__init__.py +0 -0
  163. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
  164. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  165. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/multiprocessing/process_manager_test.py +0 -0
  166. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  167. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/__init__.py +0 -0
  168. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/base_test.py +0 -0
  169. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/device_integration_test.py +0 -0
  170. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/easyocr_test.py +0 -0
  171. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/init_test.py +0 -0
  172. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/paddleocr_test.py +0 -0
  173. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/tesseract_test.py +0 -0
  174. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/playa_test.py +0 -0
  175. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/registry_test.py +0 -0
  176. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/better-ocr-image.jpg +0 -0
  177. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/contract.txt +0 -0
  178. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/contract_test.txt +0 -0
  179. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/document.docx +0 -0
  180. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/email/sample-email.eml +0 -0
  181. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  182. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/excel.xlsx +0 -0
  183. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/form_test.txt +0 -0
  184. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/french-text.txt +0 -0
  185. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/german-text.txt +0 -0
  186. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/html.html +0 -0
  187. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/images/test_hello_world.png +0 -0
  188. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/invoice_image.png +0 -0
  189. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/invoice_test.txt +0 -0
  190. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/json/sample-document.json +0 -0
  191. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  192. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/markdown.md +0 -0
  193. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  194. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/non-searchable.pdf +0 -0
  195. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/ocr-image.jpg +0 -0
  196. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  197. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  198. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  199. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  200. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/receipt_test.txt +0 -0
  201. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/report_test.txt +0 -0
  202. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/sample-contract.pdf +0 -0
  203. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/scanned.pdf +0 -0
  204. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/searchable.pdf +0 -0
  205. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/spanish-text.txt +0 -0
  206. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/test-article.pdf +0 -0
  207. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  208. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/types_test.py +0 -0
  209. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/__init__.py +0 -0
  210. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/cache_test.py +0 -0
  211. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/device_test.py +0 -0
  212. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/errors_test.py +0 -0
  213. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/pdf_lock_test.py +0 -0
  214. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/process_pool_test.py +0 -0
  215. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/serialization_test.py +0 -0
  216. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/string_test.py +0 -0
  217. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/sync_test.py +0 -0
  218. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/table_test.py +0 -0
  219. {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/tmp_test.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.9.1
3
+ Version: 3.10.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -53,6 +53,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
53
53
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
54
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
55
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
56
+ Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
56
57
  Requires-Dist: rich>=14.1.0; extra == 'all'
57
58
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
58
59
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
@@ -69,6 +70,8 @@ Provides-Extra: cli
69
70
  Requires-Dist: click>=8.2.1; extra == 'cli'
70
71
  Requires-Dist: rich>=14.1.0; extra == 'cli'
71
72
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
73
+ Provides-Extra: crypto
74
+ Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
72
75
  Provides-Extra: easyocr
73
76
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
74
77
  Provides-Extra: entity-extraction
@@ -193,16 +193,18 @@ rules:
193
193
  api = ["litestar[standard,structlog,opentelemetry]>=2.1.6"]
194
194
  cli = ["click>=8.2.1", "rich>=14.0.0", "tomli>=2.0.0; python_version<'3.11'"]
195
195
  chunking = ["semantic-text-splitter>=0.27.0"]
196
+ crypto = ["playa-pdf[crypto]>=0.6.4"]
196
197
  easyocr = ["easyocr>=1.7.2"]
197
198
  gmft = ["gmft>=0.4.2"]
198
199
  langdetect = ["fast-langdetect>=0.2.0"]
199
200
  paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
200
- all = ["kreuzberg[api,chunking,cli,easyocr,gmft,langdetect,paddleocr]"]
201
+ all = ["kreuzberg[api,chunking,cli,crypto,easyocr,gmft,langdetect,paddleocr]"]
201
202
  ```
202
203
 
203
204
  ### Installation Patterns
204
205
  - Basic: `pip install kreuzberg`
205
206
  - With features: `pip install "kreuzberg[api,cli]"`
207
+ - With crypto support: `pip install "kreuzberg[crypto]"`
206
208
  - All features: `pip install "kreuzberg[all]"`
207
209
  - Development: `uv sync --all-extras`
208
210
 
@@ -211,6 +213,14 @@ rules:
211
213
  - **System**: tesseract-ocr, pandoc (via package manager)
212
214
  - **Development**: Uses dependency groups in pyproject.toml
213
215
 
216
+ ### Crypto Support
217
+ The `crypto` extra adds cryptographic support for PDF processing:
218
+ - **Purpose**: Enables AES encryption/decryption for password-protected PDFs
219
+ - **Dependencies**: Adds cryptography (~22MB), cffi, and pycparser
220
+ - **Usage**: Required for PDFs with AES encryption (RC4 is supported in base installation)
221
+ - **Password Support**: Supports single password or list of passwords to try in sequence
222
+ - **Size Impact**: Increases installation size by ~24MB due to cryptography package
223
+
214
224
  sections:
215
225
  - title: "Language Detection"
216
226
  content: |
@@ -22,7 +22,7 @@ from kreuzberg._ocr._easyocr import EasyOCRConfig
22
22
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
23
23
  from kreuzberg._ocr._tesseract import TesseractConfig
24
24
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
25
- from kreuzberg._types import ExtractionResult, OcrBackendType
25
+ from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
26
26
  from kreuzberg._utils._errors import create_error_context, should_retry
27
27
  from kreuzberg._utils._pdf_lock import pypdfium_file_lock
28
28
  from kreuzberg._utils._string import normalize_spaces
@@ -33,6 +33,7 @@ from kreuzberg.exceptions import ParsingError
33
33
 
34
34
  if TYPE_CHECKING: # pragma: no cover
35
35
  from PIL.Image import Image
36
+ from playa.document import Document
36
37
 
37
38
 
38
39
  class PDFExtractor(Extractor):
@@ -45,7 +46,7 @@ class PDFExtractor(Extractor):
45
46
  file_path, unlink = await create_temp_file(".pdf")
46
47
  await AsyncPath(file_path).write_bytes(content)
47
48
  try:
48
- metadata = await extract_pdf_metadata(content)
49
+ metadata = await self._extract_metadata_with_password_attempts(content)
49
50
  result = await self.extract_path_async(file_path)
50
51
 
51
52
  result.metadata = metadata
@@ -73,7 +74,7 @@ class PDFExtractor(Extractor):
73
74
  if not result:
74
75
  result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
75
76
 
76
- result.metadata = await extract_pdf_metadata(content_bytes)
77
+ result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
77
78
 
78
79
  if self.config.extract_tables:
79
80
  # GMFT is optional dependency
@@ -107,7 +108,7 @@ class PDFExtractor(Extractor):
107
108
 
108
109
  result = self.extract_path_sync(Path(temp_path))
109
110
 
110
- metadata = extract_pdf_metadata_sync(content)
111
+ metadata = self._extract_metadata_with_password_attempts_sync(content)
111
112
  result.metadata = metadata
112
113
 
113
114
  return result
@@ -406,11 +407,81 @@ class PDFExtractor(Extractor):
406
407
  # Use list comprehension and join for efficient string building
407
408
  return "\n\n".join(result.content for result in results)
408
409
 
410
+ def _parse_with_password_attempts(self, content: bytes) -> Document:
411
+ """Parse PDF with password attempts."""
412
+ # Normalize password to list
413
+ if isinstance(self.config.pdf_password, str):
414
+ passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
415
+ else:
416
+ passwords = list(self.config.pdf_password)
417
+
418
+ # Try each password in sequence
419
+ last_exception = None
420
+ for password in passwords:
421
+ try:
422
+ return parse(content, max_workers=1, password=password)
423
+ except Exception as e: # noqa: PERF203, BLE001
424
+ last_exception = e
425
+ continue
426
+
427
+ # If all passwords failed, raise the last exception
428
+ if last_exception:
429
+ raise last_exception from None
430
+
431
+ # Fallback to no password
432
+ return parse(content, max_workers=1, password="")
433
+
434
+ def _get_passwords_to_try(self) -> list[str]:
435
+ """Get list of passwords to try in sequence."""
436
+ if isinstance(self.config.pdf_password, str):
437
+ return [self.config.pdf_password] if self.config.pdf_password else [""]
438
+ return list(self.config.pdf_password) if self.config.pdf_password else [""]
439
+
440
+ async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
441
+ """Extract PDF metadata with password attempts."""
442
+ passwords = self._get_passwords_to_try()
443
+
444
+ last_exception = None
445
+ for password in passwords:
446
+ try:
447
+ return await extract_pdf_metadata(content, password=password)
448
+ except Exception as e: # noqa: PERF203, BLE001
449
+ last_exception = e
450
+ continue
451
+
452
+ # If all passwords failed, try with empty password as fallback
453
+ try:
454
+ return await extract_pdf_metadata(content, password="")
455
+ except Exception:
456
+ if last_exception:
457
+ raise last_exception from None
458
+ raise
459
+
460
+ def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
461
+ """Extract PDF metadata with password attempts (sync version)."""
462
+ passwords = self._get_passwords_to_try()
463
+
464
+ last_exception = None
465
+ for password in passwords:
466
+ try:
467
+ return extract_pdf_metadata_sync(content, password=password)
468
+ except Exception as e: # noqa: PERF203, BLE001
469
+ last_exception = e
470
+ continue
471
+
472
+ # If all passwords failed, try with empty password as fallback
473
+ try:
474
+ return extract_pdf_metadata_sync(content, password="")
475
+ except Exception:
476
+ if last_exception:
477
+ raise last_exception from None
478
+ raise
479
+
409
480
  def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
410
481
  """Extract text using playa for better structure preservation."""
411
482
  with contextlib.suppress(Exception):
412
483
  content = path.read_bytes()
413
- document = parse(content, max_workers=1)
484
+ document = self._parse_with_password_attempts(content)
414
485
 
415
486
  # Extract text while preserving structure
416
487
  pages_text = []
@@ -24,11 +24,12 @@ FULL_DATE_LENGTH = 14
24
24
  BOM_CHAR = "\ufeff"
25
25
 
26
26
 
27
- async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
27
+ async def extract_pdf_metadata(pdf_content: bytes, password: str = "") -> Metadata:
28
28
  """Extract metadata from a PDF document.
29
29
 
30
30
  Args:
31
31
  pdf_content: The bytes of the PDF document.
32
+ password: Password for encrypted PDF files.
32
33
 
33
34
  Raises:
34
35
  ParsingError: If the PDF metadata could not be extracted.
@@ -37,7 +38,7 @@ async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
37
38
  A dictionary of metadata extracted from the PDF.
38
39
  """
39
40
  try:
40
- document = parse(pdf_content, max_workers=1)
41
+ document = parse(pdf_content, max_workers=1, password=password)
41
42
  metadata: Metadata = {}
42
43
 
43
44
  for raw_info in document.info:
@@ -275,13 +276,14 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
275
276
  result["subtitle"] = subtitle
276
277
 
277
278
 
278
- def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
279
+ def extract_pdf_metadata_sync(pdf_content: bytes, password: str = "") -> Metadata:
279
280
  """Synchronous version of extract_pdf_metadata.
280
281
 
281
282
  Extract metadata from a PDF document without using async/await.
282
283
 
283
284
  Args:
284
285
  pdf_content: The bytes of the PDF document.
286
+ password: Password for encrypted PDF files.
285
287
 
286
288
  Raises:
287
289
  ParsingError: If the PDF metadata could not be extracted.
@@ -290,7 +292,7 @@ def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
290
292
  A dictionary of metadata extracted from the PDF.
291
293
  """
292
294
  try:
293
- document = parse(pdf_content, max_workers=1)
295
+ document = parse(pdf_content, max_workers=1, password=password)
294
296
  metadata: Metadata = {}
295
297
 
296
298
  for raw_info in document.info:
@@ -357,6 +357,8 @@ class ExtractionConfig:
357
357
  """The mode to use for document classification."""
358
358
  enable_quality_processing: bool = True
359
359
  """Whether to apply quality post-processing to improve extraction results."""
360
+ pdf_password: str | list[str] = ""
361
+ """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
360
362
 
361
363
  def __post_init__(self) -> None:
362
364
  if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
5
5
 
6
6
  [project]
7
7
  name = "kreuzberg"
8
- version = "3.9.1"
8
+ version = "3.10.0"
9
9
  description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
10
10
  readme = "README.md"
11
11
  keywords = [
@@ -76,7 +76,7 @@ optional-dependencies.additional-extensions = [
76
76
  "tomli>=2.0.0; python_version<'3.11'",
77
77
  ]
78
78
  optional-dependencies.all = [
79
- "kreuzberg[additional-extensions,api,chunking,cli,easyocr,entity-extraction,gmft,langdetect,paddleocr]",
79
+ "kreuzberg[additional-extensions,api,chunking,cli,crypto,easyocr,entity-extraction,gmft,langdetect,paddleocr]",
80
80
  ]
81
81
  optional-dependencies.api = [
82
82
  "litestar[standard,structlog,opentelemetry]>=2.16.0",
@@ -91,6 +91,7 @@ optional-dependencies.cli = [
91
91
  "rich>=14.1.0",
92
92
  "tomli>=2.0.0; python_version<'3.11'",
93
93
  ]
94
+ optional-dependencies.crypto = [ "playa-pdf[crypto]>=0.6.4" ]
94
95
  optional-dependencies.easyocr = [ "easyocr>=1.7.2" ]
95
96
  optional-dependencies.entity-extraction = [ "keybert>=0.9.0", "spacy>=3.8.7" ]
96
97
  optional-dependencies.gmft = [ "gmft>=0.4.2" ]
@@ -388,3 +388,51 @@ async def test_extract_pdf_searchable_text_page_errors(
388
388
  return MockDocument()
389
389
 
390
390
  monkeypatch.setattr(pypdfium2, "PdfDocument", mock_pdf_document)
391
+
392
+
393
+ def test_pdf_password_configuration() -> None:
394
+ """Test PDF password configuration variations."""
395
+ # Test single password string
396
+ config = ExtractionConfig(pdf_password="test")
397
+ extractor = PDFExtractor(mime_type="application/pdf", config=config)
398
+ passwords = extractor._get_passwords_to_try()
399
+ assert passwords == ["test"]
400
+
401
+ # Test multiple passwords list
402
+ config = ExtractionConfig(pdf_password=["pass1", "pass2", "pass3"])
403
+ extractor = PDFExtractor(mime_type="application/pdf", config=config)
404
+ passwords = extractor._get_passwords_to_try()
405
+ assert passwords == ["pass1", "pass2", "pass3"]
406
+
407
+ # Test empty password string
408
+ config = ExtractionConfig(pdf_password="")
409
+ extractor = PDFExtractor(mime_type="application/pdf", config=config)
410
+ passwords = extractor._get_passwords_to_try()
411
+ assert passwords == [""]
412
+
413
+ # Test empty password list
414
+ config = ExtractionConfig(pdf_password=[])
415
+ extractor = PDFExtractor(mime_type="application/pdf", config=config)
416
+ passwords = extractor._get_passwords_to_try()
417
+ assert passwords == [""]
418
+
419
+
420
+ def test_pdf_password_attempts_with_parse_with_password_attempts(test_article: Path) -> None:
421
+ """Test the _parse_with_password_attempts method with different password configurations."""
422
+ # Test with no password (should work with regular PDF)
423
+ config = ExtractionConfig(pdf_password="")
424
+ extractor = PDFExtractor(mime_type="application/pdf", config=config)
425
+
426
+ content = test_article.read_bytes()
427
+ document = extractor._parse_with_password_attempts(content)
428
+
429
+ assert document is not None
430
+ assert len(document.pages) > 0
431
+
432
+ # Test with wrong password but fallback should work
433
+ config = ExtractionConfig(pdf_password="wrongpassword")
434
+ extractor = PDFExtractor(mime_type="application/pdf", config=config)
435
+
436
+ document = extractor._parse_with_password_attempts(content)
437
+ assert document is not None
438
+ assert len(document.pages) > 0
@@ -876,6 +876,53 @@ toml = [
876
876
  { name = "tomli", marker = "python_full_version <= '3.11'" },
877
877
  ]
878
878
 
879
+ [[package]]
880
+ name = "cryptography"
881
+ version = "45.0.5"
882
+ source = { registry = "https://pypi.org/simple" }
883
+ dependencies = [
884
+ { name = "cffi", marker = "platform_python_implementation != 'PyPy'" },
885
+ ]
886
+ sdist = { url = "https://files.pythonhosted.org/packages/95/1e/49527ac611af559665f71cbb8f92b332b5ec9c6fbc4e88b0f8e92f5e85df/cryptography-45.0.5.tar.gz", hash = "sha256:72e76caa004ab63accdf26023fccd1d087f6d90ec6048ff33ad0445abf7f605a", size = 744903, upload-time = "2025-07-02T13:06:25.941Z" }
887
+ wheels = [
888
+ { url = "https://files.pythonhosted.org/packages/f0/fb/09e28bc0c46d2c547085e60897fea96310574c70fb21cd58a730a45f3403/cryptography-45.0.5-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:101ee65078f6dd3e5a028d4f19c07ffa4dd22cce6a20eaa160f8b5219911e7d8", size = 7043092, upload-time = "2025-07-02T13:05:01.514Z" },
889
+ { url = "https://files.pythonhosted.org/packages/b1/05/2194432935e29b91fb649f6149c1a4f9e6d3d9fc880919f4ad1bcc22641e/cryptography-45.0.5-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3a264aae5f7fbb089dbc01e0242d3b67dffe3e6292e1f5182122bdf58e65215d", size = 4205926, upload-time = "2025-07-02T13:05:04.741Z" },
890
+ { url = "https://files.pythonhosted.org/packages/07/8b/9ef5da82350175e32de245646b1884fc01124f53eb31164c77f95a08d682/cryptography-45.0.5-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e74d30ec9c7cb2f404af331d5b4099a9b322a8a6b25c4632755c8757345baac5", size = 4429235, upload-time = "2025-07-02T13:05:07.084Z" },
891
+ { url = "https://files.pythonhosted.org/packages/7c/e1/c809f398adde1994ee53438912192d92a1d0fc0f2d7582659d9ef4c28b0c/cryptography-45.0.5-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3af26738f2db354aafe492fb3869e955b12b2ef2e16908c8b9cb928128d42c57", size = 4209785, upload-time = "2025-07-02T13:05:09.321Z" },
892
+ { url = "https://files.pythonhosted.org/packages/d0/8b/07eb6bd5acff58406c5e806eff34a124936f41a4fb52909ffa4d00815f8c/cryptography-45.0.5-cp311-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e6c00130ed423201c5bc5544c23359141660b07999ad82e34e7bb8f882bb78e0", size = 3893050, upload-time = "2025-07-02T13:05:11.069Z" },
893
+ { url = "https://files.pythonhosted.org/packages/ec/ef/3333295ed58d900a13c92806b67e62f27876845a9a908c939f040887cca9/cryptography-45.0.5-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:dd420e577921c8c2d31289536c386aaa30140b473835e97f83bc71ea9d2baf2d", size = 4457379, upload-time = "2025-07-02T13:05:13.32Z" },
894
+ { url = "https://files.pythonhosted.org/packages/d9/9d/44080674dee514dbb82b21d6fa5d1055368f208304e2ab1828d85c9de8f4/cryptography-45.0.5-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:d05a38884db2ba215218745f0781775806bde4f32e07b135348355fe8e4991d9", size = 4209355, upload-time = "2025-07-02T13:05:15.017Z" },
895
+ { url = "https://files.pythonhosted.org/packages/c9/d8/0749f7d39f53f8258e5c18a93131919ac465ee1f9dccaf1b3f420235e0b5/cryptography-45.0.5-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:ad0caded895a00261a5b4aa9af828baede54638754b51955a0ac75576b831b27", size = 4456087, upload-time = "2025-07-02T13:05:16.945Z" },
896
+ { url = "https://files.pythonhosted.org/packages/09/d7/92acac187387bf08902b0bf0699816f08553927bdd6ba3654da0010289b4/cryptography-45.0.5-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9024beb59aca9d31d36fcdc1604dd9bbeed0a55bface9f1908df19178e2f116e", size = 4332873, upload-time = "2025-07-02T13:05:18.743Z" },
897
+ { url = "https://files.pythonhosted.org/packages/03/c2/840e0710da5106a7c3d4153c7215b2736151bba60bf4491bdb421df5056d/cryptography-45.0.5-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:91098f02ca81579c85f66df8a588c78f331ca19089763d733e34ad359f474174", size = 4564651, upload-time = "2025-07-02T13:05:21.382Z" },
898
+ { url = "https://files.pythonhosted.org/packages/2e/92/cc723dd6d71e9747a887b94eb3827825c6c24b9e6ce2bb33b847d31d5eaa/cryptography-45.0.5-cp311-abi3-win32.whl", hash = "sha256:926c3ea71a6043921050eaa639137e13dbe7b4ab25800932a8498364fc1abec9", size = 2929050, upload-time = "2025-07-02T13:05:23.39Z" },
899
+ { url = "https://files.pythonhosted.org/packages/1f/10/197da38a5911a48dd5389c043de4aec4b3c94cb836299b01253940788d78/cryptography-45.0.5-cp311-abi3-win_amd64.whl", hash = "sha256:b85980d1e345fe769cfc57c57db2b59cff5464ee0c045d52c0df087e926fbe63", size = 3403224, upload-time = "2025-07-02T13:05:25.202Z" },
900
+ { url = "https://files.pythonhosted.org/packages/fe/2b/160ce8c2765e7a481ce57d55eba1546148583e7b6f85514472b1d151711d/cryptography-45.0.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:f3562c2f23c612f2e4a6964a61d942f891d29ee320edb62ff48ffb99f3de9ae8", size = 7017143, upload-time = "2025-07-02T13:05:27.229Z" },
901
+ { url = "https://files.pythonhosted.org/packages/c2/e7/2187be2f871c0221a81f55ee3105d3cf3e273c0a0853651d7011eada0d7e/cryptography-45.0.5-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3fcfbefc4a7f332dece7272a88e410f611e79458fab97b5efe14e54fe476f4fd", size = 4197780, upload-time = "2025-07-02T13:05:29.299Z" },
902
+ { url = "https://files.pythonhosted.org/packages/b9/cf/84210c447c06104e6be9122661159ad4ce7a8190011669afceeaea150524/cryptography-45.0.5-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:460f8c39ba66af7db0545a8c6f2eabcbc5a5528fc1cf6c3fa9a1e44cec33385e", size = 4420091, upload-time = "2025-07-02T13:05:31.221Z" },
903
+ { url = "https://files.pythonhosted.org/packages/3e/6a/cb8b5c8bb82fafffa23aeff8d3a39822593cee6e2f16c5ca5c2ecca344f7/cryptography-45.0.5-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:9b4cf6318915dccfe218e69bbec417fdd7c7185aa7aab139a2c0beb7468c89f0", size = 4198711, upload-time = "2025-07-02T13:05:33.062Z" },
904
+ { url = "https://files.pythonhosted.org/packages/04/f7/36d2d69df69c94cbb2473871926daf0f01ad8e00fe3986ac3c1e8c4ca4b3/cryptography-45.0.5-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2089cc8f70a6e454601525e5bf2779e665d7865af002a5dec8d14e561002e135", size = 3883299, upload-time = "2025-07-02T13:05:34.94Z" },
905
+ { url = "https://files.pythonhosted.org/packages/82/c7/f0ea40f016de72f81288e9fe8d1f6748036cb5ba6118774317a3ffc6022d/cryptography-45.0.5-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:0027d566d65a38497bc37e0dd7c2f8ceda73597d2ac9ba93810204f56f52ebc7", size = 4450558, upload-time = "2025-07-02T13:05:37.288Z" },
906
+ { url = "https://files.pythonhosted.org/packages/06/ae/94b504dc1a3cdf642d710407c62e86296f7da9e66f27ab12a1ee6fdf005b/cryptography-45.0.5-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:be97d3a19c16a9be00edf79dca949c8fa7eff621763666a145f9f9535a5d7f42", size = 4198020, upload-time = "2025-07-02T13:05:39.102Z" },
907
+ { url = "https://files.pythonhosted.org/packages/05/2b/aaf0adb845d5dabb43480f18f7ca72e94f92c280aa983ddbd0bcd6ecd037/cryptography-45.0.5-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:7760c1c2e1a7084153a0f68fab76e754083b126a47d0117c9ed15e69e2103492", size = 4449759, upload-time = "2025-07-02T13:05:41.398Z" },
908
+ { url = "https://files.pythonhosted.org/packages/91/e4/f17e02066de63e0100a3a01b56f8f1016973a1d67551beaf585157a86b3f/cryptography-45.0.5-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6ff8728d8d890b3dda5765276d1bc6fb099252915a2cd3aff960c4c195745dd0", size = 4319991, upload-time = "2025-07-02T13:05:43.64Z" },
909
+ { url = "https://files.pythonhosted.org/packages/f2/2e/e2dbd629481b499b14516eed933f3276eb3239f7cee2dcfa4ee6b44d4711/cryptography-45.0.5-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7259038202a47fdecee7e62e0fd0b0738b6daa335354396c6ddebdbe1206af2a", size = 4554189, upload-time = "2025-07-02T13:05:46.045Z" },
910
+ { url = "https://files.pythonhosted.org/packages/f8/ea/a78a0c38f4c8736287b71c2ea3799d173d5ce778c7d6e3c163a95a05ad2a/cryptography-45.0.5-cp37-abi3-win32.whl", hash = "sha256:1e1da5accc0c750056c556a93c3e9cb828970206c68867712ca5805e46dc806f", size = 2911769, upload-time = "2025-07-02T13:05:48.329Z" },
911
+ { url = "https://files.pythonhosted.org/packages/79/b3/28ac139109d9005ad3f6b6f8976ffede6706a6478e21c889ce36c840918e/cryptography-45.0.5-cp37-abi3-win_amd64.whl", hash = "sha256:90cb0a7bb35959f37e23303b7eed0a32280510030daba3f7fdfbb65defde6a97", size = 3390016, upload-time = "2025-07-02T13:05:50.811Z" },
912
+ { url = "https://files.pythonhosted.org/packages/f8/8b/34394337abe4566848a2bd49b26bcd4b07fd466afd3e8cce4cb79a390869/cryptography-45.0.5-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:206210d03c1193f4e1ff681d22885181d47efa1ab3018766a7b32a7b3d6e6afd", size = 3575762, upload-time = "2025-07-02T13:05:53.166Z" },
913
+ { url = "https://files.pythonhosted.org/packages/8b/5d/a19441c1e89afb0f173ac13178606ca6fab0d3bd3ebc29e9ed1318b507fc/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c648025b6840fe62e57107e0a25f604db740e728bd67da4f6f060f03017d5097", size = 4140906, upload-time = "2025-07-02T13:05:55.914Z" },
914
+ { url = "https://files.pythonhosted.org/packages/4b/db/daceb259982a3c2da4e619f45b5bfdec0e922a23de213b2636e78ef0919b/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b8fa8b0a35a9982a3c60ec79905ba5bb090fc0b9addcfd3dc2dd04267e45f25e", size = 4374411, upload-time = "2025-07-02T13:05:57.814Z" },
915
+ { url = "https://files.pythonhosted.org/packages/6a/35/5d06ad06402fc522c8bf7eab73422d05e789b4e38fe3206a85e3d6966c11/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:14d96584701a887763384f3c47f0ca7c1cce322aa1c31172680eb596b890ec30", size = 4140942, upload-time = "2025-07-02T13:06:00.137Z" },
916
+ { url = "https://files.pythonhosted.org/packages/65/79/020a5413347e44c382ef1f7f7e7a66817cd6273e3e6b5a72d18177b08b2f/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:57c816dfbd1659a367831baca4b775b2a5b43c003daf52e9d57e1d30bc2e1b0e", size = 4374079, upload-time = "2025-07-02T13:06:02.043Z" },
917
+ { url = "https://files.pythonhosted.org/packages/9b/c5/c0e07d84a9a2a8a0ed4f865e58f37c71af3eab7d5e094ff1b21f3f3af3bc/cryptography-45.0.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b9e38e0a83cd51e07f5a48ff9691cae95a79bea28fe4ded168a8e5c6c77e819d", size = 3321362, upload-time = "2025-07-02T13:06:04.463Z" },
918
+ { url = "https://files.pythonhosted.org/packages/c0/71/9bdbcfd58d6ff5084687fe722c58ac718ebedbc98b9f8f93781354e6d286/cryptography-45.0.5-pp311-pypy311_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8c4a6ff8a30e9e3d38ac0539e9a9e02540ab3f827a3394f8852432f6b0ea152e", size = 3587878, upload-time = "2025-07-02T13:06:06.339Z" },
919
+ { url = "https://files.pythonhosted.org/packages/f0/63/83516cfb87f4a8756eaa4203f93b283fda23d210fc14e1e594bd5f20edb6/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bd4c45986472694e5121084c6ebbd112aa919a25e783b87eb95953c9573906d6", size = 4152447, upload-time = "2025-07-02T13:06:08.345Z" },
920
+ { url = "https://files.pythonhosted.org/packages/22/11/d2823d2a5a0bd5802b3565437add16f5c8ce1f0778bf3822f89ad2740a38/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:982518cd64c54fcada9d7e5cf28eabd3ee76bd03ab18e08a48cad7e8b6f31b18", size = 4386778, upload-time = "2025-07-02T13:06:10.263Z" },
921
+ { url = "https://files.pythonhosted.org/packages/5f/38/6bf177ca6bce4fe14704ab3e93627c5b0ca05242261a2e43ef3168472540/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:12e55281d993a793b0e883066f590c1ae1e802e3acb67f8b442e721e475e6463", size = 4151627, upload-time = "2025-07-02T13:06:13.097Z" },
922
+ { url = "https://files.pythonhosted.org/packages/38/6a/69fc67e5266bff68a91bcb81dff8fb0aba4d79a78521a08812048913e16f/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:5aa1e32983d4443e310f726ee4b071ab7569f58eedfdd65e9675484a4eb67bd1", size = 4385593, upload-time = "2025-07-02T13:06:15.689Z" },
923
+ { url = "https://files.pythonhosted.org/packages/f6/34/31a1604c9a9ade0fdab61eb48570e09a796f4d9836121266447b0eaf7feb/cryptography-45.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:e357286c1b76403dd384d938f93c46b2b058ed4dfcdce64a770f0537ed3feb6f", size = 3331106, upload-time = "2025-07-02T13:06:18.058Z" },
924
+ ]
925
+
879
926
  [[package]]
880
927
  name = "csscompressor"
881
928
  version = "0.9.5"
@@ -1079,7 +1126,7 @@ name = "exceptiongroup"
1079
1126
  version = "1.3.0"
1080
1127
  source = { registry = "https://pypi.org/simple" }
1081
1128
  dependencies = [
1082
- { name = "typing-extensions", marker = "python_full_version < '3.13'" },
1129
+ { name = "typing-extensions", marker = "python_full_version < '3.11'" },
1083
1130
  ]
1084
1131
  sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" }
1085
1132
  wheels = [
@@ -1965,7 +2012,7 @@ wheels = [
1965
2012
 
1966
2013
  [[package]]
1967
2014
  name = "kreuzberg"
1968
- version = "3.9.0"
2015
+ version = "3.10.0"
1969
2016
  source = { editable = "." }
1970
2017
  dependencies = [
1971
2018
  { name = "anyio" },
@@ -1997,6 +2044,7 @@ all = [
1997
2044
  { name = "mailparse" },
1998
2045
  { name = "paddleocr" },
1999
2046
  { name = "paddlepaddle" },
2047
+ { name = "playa-pdf", extra = ["crypto"] },
2000
2048
  { name = "rich" },
2001
2049
  { name = "semantic-text-splitter" },
2002
2050
  { name = "setuptools" },
@@ -2018,6 +2066,9 @@ cli = [
2018
2066
  { name = "rich" },
2019
2067
  { name = "tomli", marker = "python_full_version < '3.11'" },
2020
2068
  ]
2069
+ crypto = [
2070
+ { name = "playa-pdf", extra = ["crypto"] },
2071
+ ]
2021
2072
  easyocr = [
2022
2073
  { name = "easyocr" },
2023
2074
  ]
@@ -2070,7 +2121,7 @@ requires-dist = [
2070
2121
  { name = "gmft", marker = "extra == 'gmft'", specifier = ">=0.4.2" },
2071
2122
  { name = "html-to-markdown", extras = ["lxml"], specifier = ">=1.9.0" },
2072
2123
  { name = "keybert", marker = "extra == 'entity-extraction'", specifier = ">=0.9.0" },
2073
- { name = "kreuzberg", extras = ["additional-extensions", "api", "chunking", "cli", "easyocr", "entity-extraction", "gmft", "langdetect", "paddleocr"], marker = "extra == 'all'" },
2124
+ { name = "kreuzberg", extras = ["additional-extensions", "api", "chunking", "cli", "crypto", "easyocr", "entity-extraction", "gmft", "langdetect", "paddleocr"], marker = "extra == 'all'" },
2074
2125
  { name = "litestar", extras = ["standard", "structlog", "opentelemetry"], marker = "extra == 'api'", specifier = ">=2.16.0" },
2075
2126
  { name = "mailparse", marker = "extra == 'additional-extensions'", specifier = ">=1.0.15" },
2076
2127
  { name = "mcp", specifier = ">=1.12.2" },
@@ -2079,6 +2130,7 @@ requires-dist = [
2079
2130
  { name = "paddlepaddle", marker = "extra == 'paddleocr'", specifier = ">=3.1.0" },
2080
2131
  { name = "pandas", marker = "extra == 'auto-classify-document-type'", specifier = ">=2.3.1" },
2081
2132
  { name = "playa-pdf", specifier = ">=0.6.4" },
2133
+ { name = "playa-pdf", extras = ["crypto"], marker = "extra == 'crypto'", specifier = ">=0.6.4" },
2082
2134
  { name = "psutil", specifier = ">=7.0.0" },
2083
2135
  { name = "pypdfium2", specifier = "==4.30.0" },
2084
2136
  { name = "python-calamine", specifier = ">=0.3.2" },
@@ -2091,7 +2143,7 @@ requires-dist = [
2091
2143
  { name = "tomli", marker = "python_full_version < '3.11' and extra == 'cli'", specifier = ">=2.0.0" },
2092
2144
  { name = "typing-extensions", marker = "python_full_version < '3.12'", specifier = ">=4.14.0" },
2093
2145
  ]
2094
- provides-extras = ["additional-extensions", "all", "api", "auto-classify-document-type", "chunking", "cli", "easyocr", "entity-extraction", "gmft", "langdetect", "paddleocr"]
2146
+ provides-extras = ["additional-extensions", "all", "api", "auto-classify-document-type", "chunking", "cli", "crypto", "easyocr", "entity-extraction", "gmft", "langdetect", "paddleocr"]
2095
2147
 
2096
2148
  [package.metadata.requires-dev]
2097
2149
  dev = [
@@ -3988,6 +4040,11 @@ wheels = [
3988
4040
  { url = "https://files.pythonhosted.org/packages/42/66/5362cccdabd6b425cbf6cf2e115560255066427d52a45c36891e63f7be97/playa_pdf-0.6.4-py3-none-any.whl", hash = "sha256:d8ff856bab8be784fd39ad83bbd7bdd09db382b1cb6923d14a0cb03e0e6841f0", size = 5661468, upload-time = "2025-07-26T16:09:33.04Z" },
3989
4041
  ]
3990
4042
 
4043
+ [package.optional-dependencies]
4044
+ crypto = [
4045
+ { name = "cryptography" },
4046
+ ]
4047
+
3991
4048
  [[package]]
3992
4049
  name = "pluggy"
3993
4050
  version = "1.6.0"
@@ -6081,7 +6138,7 @@ wheels = [
6081
6138
 
6082
6139
  [[package]]
6083
6140
  name = "transformers"
6084
- version = "4.54.0"
6141
+ version = "4.54.1"
6085
6142
  source = { registry = "https://pypi.org/simple" }
6086
6143
  dependencies = [
6087
6144
  { name = "filelock" },
@@ -6096,9 +6153,9 @@ dependencies = [
6096
6153
  { name = "tokenizers" },
6097
6154
  { name = "tqdm" },
6098
6155
  ]
6099
- sdist = { url = "https://files.pythonhosted.org/packages/fb/4b/3341d2fade52634d877476f4ed5fa8f7bf3f1e867bfba76f0fb341e2885f/transformers-4.54.0.tar.gz", hash = "sha256:843da4d66a573cef3d1b2e7a1d767e77da054621e69d9f3faff761e55a1f8203", size = 9510412, upload-time = "2025-07-25T18:58:20.826Z" }
6156
+ sdist = { url = "https://files.pythonhosted.org/packages/21/6c/4caeb57926f91d943f309b062e22ad1eb24a9f530421c5a65c1d89378a7a/transformers-4.54.1.tar.gz", hash = "sha256:b2551bb97903f13bd90c9467d0a144d41ca4d142defc044a99502bb77c5c1052", size = 9514288, upload-time = "2025-07-29T15:57:22.826Z" }
6100
6157
  wheels = [
6101
- { url = "https://files.pythonhosted.org/packages/cc/34/4d82dc596764de9d14285f8ed53b50896bf05fbbcd71a82c6d174b3ab8c7/transformers-4.54.0-py3-none-any.whl", hash = "sha256:c96e607f848625965b76c677b2c2576f2c7b7097c1c5292b281919d90675a25e", size = 11176597, upload-time = "2025-07-25T18:58:17.677Z" },
6158
+ { url = "https://files.pythonhosted.org/packages/cf/18/eb7578f84ef5a080d4e5ca9bc4f7c68e7aa9c1e464f1b3d3001e4c642fce/transformers-4.54.1-py3-none-any.whl", hash = "sha256:c89965a4f62a0d07009d45927a9c6372848a02ab9ead9c318c3d082708bab529", size = 11176397, upload-time = "2025-07-29T15:57:19.692Z" },
6102
6159
  ]
6103
6160
 
6104
6161
  [[package]]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes