kreuzberg 3.5.0__tar.gz → 3.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/publish-docker.yml +10 -21
  2. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/release.yaml +6 -0
  3. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/PKG-INFO +11 -5
  4. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/ai-rulez.yaml +25 -9
  5. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/types.md +18 -0
  6. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/examples/extraction-examples.md +77 -0
  7. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/getting-started/installation.md +25 -1
  8. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/extraction-configuration.md +128 -0
  9. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/__init__.py +4 -1
  10. kreuzberg-3.6.1/kreuzberg/_entity_extraction.py +239 -0
  11. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_types.py +35 -0
  12. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/extraction.py +39 -22
  13. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/pyproject.toml +13 -10
  14. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/cli_integration_test.py +1 -1
  15. kreuzberg-3.6.1/tests/entity_extraction_test.py +102 -0
  16. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extraction_test.py +0 -1
  17. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/pdf_test.py +0 -1
  18. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/uv.lock +401 -67
  19. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.commitlintrc +0 -0
  20. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.docker/Dockerfile +0 -0
  21. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.docker/README.md +0 -0
  22. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.dockerignore +0 -0
  23. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/dependabot.yaml +0 -0
  24. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/ci.yaml +0 -0
  25. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/docs.yml +0 -0
  26. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/pr-title.yaml +0 -0
  27. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.gitignore +0 -0
  28. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.gitmodules +0 -0
  29. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.markdownlint.yaml +0 -0
  30. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.pre-commit-config.yaml +0 -0
  31. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/LICENSE +0 -0
  32. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/README.md +0 -0
  33. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/README.md +0 -0
  34. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/benchmark_baseline.py +0 -0
  35. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/end_to_end_benchmark.py +0 -0
  36. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/final_benchmark.py +0 -0
  37. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/pyproject.toml +0 -0
  38. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/baseline_results.json +0 -0
  39. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  40. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
  41. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/final_benchmark_results.json +0 -0
  42. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/mime_caching_results.json +0 -0
  43. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/msgspec_caching_results.json +0 -0
  44. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/ocr_caching_results.json +0 -0
  45. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
  46. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
  47. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/table_caching_results.json +0 -0
  48. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/serialization_benchmark.py +0 -0
  49. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  50. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  51. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  52. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  53. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  54. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  55. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  56. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/statistical_benchmark.py +0 -0
  57. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/advanced/custom-extractors.md +0 -0
  58. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/advanced/custom-hooks.md +0 -0
  59. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/advanced/error-handling.md +0 -0
  60. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/advanced/index.md +0 -0
  61. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/advanced/performance.md +0 -0
  62. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/exceptions.md +0 -0
  63. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/extraction-functions.md +0 -0
  64. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/extractor-registry.md +0 -0
  65. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/index.md +0 -0
  66. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/ocr-configuration.md +0 -0
  67. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/assets/favicon.png +0 -0
  68. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/assets/logo.png +0 -0
  69. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/changelog.md +0 -0
  70. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/cli.md +0 -0
  71. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/contributing.md +0 -0
  72. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/css/extra.css +0 -0
  73. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/examples/index.md +0 -0
  74. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/getting-started/index.md +0 -0
  75. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/getting-started/quick-start.md +0 -0
  76. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/index.md +0 -0
  77. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/api-server.md +0 -0
  78. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/basic-usage.md +0 -0
  79. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/chunking.md +0 -0
  80. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/docker.md +0 -0
  81. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/index.md +0 -0
  82. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/metadata-extraction.md +0 -0
  83. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/ocr-backends.md +0 -0
  84. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/ocr-configuration.md +0 -0
  85. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/supported-formats.md +0 -0
  86. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/__main__.py +0 -0
  87. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_api/__init__.py +0 -0
  88. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_api/main.py +0 -0
  89. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_chunker.py +0 -0
  90. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_cli_config.py +0 -0
  91. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_constants.py +0 -0
  92. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/__init__.py +0 -0
  93. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_base.py +0 -0
  94. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_html.py +0 -0
  95. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_image.py +0 -0
  96. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_pandoc.py +0 -0
  97. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_pdf.py +0 -0
  98. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_presentation.py +0 -0
  99. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  100. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_gmft.py +0 -0
  101. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_language_detection.py +0 -0
  102. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_mime_types.py +0 -0
  103. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/__init__.py +0 -0
  104. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
  105. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/process_manager.py +0 -0
  106. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/sync_easyocr.py +0 -0
  107. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/sync_paddleocr.py +0 -0
  108. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
  109. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
  110. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_ocr/__init__.py +0 -0
  111. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_ocr/_base.py +0 -0
  112. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_ocr/_easyocr.py +0 -0
  113. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_ocr/_paddleocr.py +0 -0
  114. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_ocr/_tesseract.py +0 -0
  115. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_playa.py +0 -0
  116. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_registry.py +0 -0
  117. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/__init__.py +0 -0
  118. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_cache.py +0 -0
  119. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_device.py +0 -0
  120. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_document_cache.py +0 -0
  121. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_errors.py +0 -0
  122. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
  123. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_process_pool.py +0 -0
  124. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_serialization.py +0 -0
  125. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_string.py +0 -0
  126. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_sync.py +0 -0
  127. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_tmp.py +0 -0
  128. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/cli.py +0 -0
  129. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/exceptions.py +0 -0
  130. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/py.typed +0 -0
  131. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/mkdocs.yaml +0 -0
  132. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/__init__.py +0 -0
  133. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/api/__init__.py +0 -0
  134. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/api/main_test.py +0 -0
  135. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/chunker_test.py +0 -0
  136. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/cli_test.py +0 -0
  137. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/conftest.py +0 -0
  138. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/exceptions_test.py +0 -0
  139. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extraction_batch_test.py +0 -0
  140. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/__init__.py +0 -0
  141. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/html_test.py +0 -0
  142. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/image_test.py +0 -0
  143. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/pandoc_metadata_test.py +0 -0
  144. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/pandoc_test.py +0 -0
  145. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/presentation_test.py +0 -0
  146. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/spreed_sheet_test.py +0 -0
  147. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/gmft_extended_test.py +0 -0
  148. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/gmft_test.py +0 -0
  149. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/hooks_test.py +0 -0
  150. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/language_detection_test.py +0 -0
  151. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/mime_types_test.py +0 -0
  152. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/multiprocessing/__init__.py +0 -0
  153. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
  154. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/multiprocessing/process_manager_test.py +0 -0
  155. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/multiprocessing/sync_tesseract_test.py +0 -0
  156. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  157. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/__init__.py +0 -0
  158. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/base_test.py +0 -0
  159. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/device_integration_test.py +0 -0
  160. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/easyocr_test.py +0 -0
  161. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/init_test.py +0 -0
  162. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/paddleocr_test.py +0 -0
  163. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/tesseract_test.py +0 -0
  164. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/playa_test.py +0 -0
  165. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/registry_test.py +0 -0
  166. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/document.docx +0 -0
  167. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  168. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/excel.xlsx +0 -0
  169. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/french-text.txt +0 -0
  170. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/german-text.txt +0 -0
  171. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/html.html +0 -0
  172. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/markdown.md +0 -0
  173. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
  174. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/non-searchable.pdf +0 -0
  175. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/ocr-image.jpg +0 -0
  176. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  177. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  178. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  179. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  180. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/sample-contract.pdf +0 -0
  181. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/scanned.pdf +0 -0
  182. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/searchable.pdf +0 -0
  183. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/spanish-text.txt +0 -0
  184. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/test-article.pdf +0 -0
  185. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/types_test.py +0 -0
  186. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/__init__.py +0 -0
  187. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/cache_test.py +0 -0
  188. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/device_test.py +0 -0
  189. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/errors_test.py +0 -0
  190. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/pdf_lock_test.py +0 -0
  191. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/process_pool_test.py +0 -0
  192. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/serialization_test.py +0 -0
  193. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/string_test.py +0 -0
  194. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/sync_test.py +0 -0
  195. {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/tmp_test.py +0 -0
@@ -2,18 +2,14 @@
2
2
  name: Publish Docker Images
3
3
 
4
4
  on:
5
- workflow_run:
6
- workflows: ["Release"]
7
- types:
8
- - completed
9
- branches:
10
- - main
11
5
  workflow_dispatch:
6
+ release:
7
+ types: [published]
12
8
 
13
9
  jobs:
14
10
  build-and-push:
15
11
  runs-on: ubuntu-latest
16
- if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
12
+ if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'release' }}
17
13
  permissions:
18
14
  contents: read
19
15
  packages: write
@@ -41,28 +37,21 @@ jobs:
41
37
  - name: Checkout repository
42
38
  uses: actions/checkout@v4
43
39
  with:
44
- ref: ${{ github.event.workflow_run.head_branch || github.ref }}
40
+ ref: ${{ github.ref }}
45
41
 
46
42
  - name: Get release version
47
43
  id: get_version
48
44
  run: |
49
- if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
50
- # For manual dispatch, get the latest tag by listing all tags
45
+ if [ "${{ github.event_name }}" = "release" ]; then
46
+ # For release events, use the release tag
47
+ VERSION="${{ github.event.release.tag_name }}"
48
+ else
49
+ # For workflow_dispatch, get the latest tag
51
50
  git fetch --tags
52
51
  VERSION=$(git tag --sort=-version:refname | head -n1)
53
- else
54
- # For workflow_run, use the head branch
55
- VERSION="${{ github.event.workflow_run.head_branch }}"
56
- # If triggered by a tag, extract version
57
- if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
58
- VERSION="$VERSION"
59
- else
60
- # Get the latest tag by listing all tags
61
- git fetch --tags
62
- VERSION=$(git tag --sort=-version:refname | head -n1)
63
- fi
64
52
  fi
65
53
  echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
54
+ echo "Using version: $VERSION"
66
55
 
67
56
  - name: Set up QEMU
68
57
  uses: docker/setup-qemu-action@v3
@@ -10,6 +10,7 @@ jobs:
10
10
  environment: pypi
11
11
  permissions:
12
12
  id-token: write
13
+ contents: read
13
14
  steps:
14
15
  - name: Checkout
15
16
  uses: actions/checkout@v4
@@ -29,3 +30,8 @@ jobs:
29
30
 
30
31
  - name: Publish
31
32
  uses: pypa/gh-action-pypi-publish@release/v1
33
+
34
+ - name: Docker Build Info
35
+ run: |
36
+ echo "Docker images will be built automatically by the publish-docker.yml workflow"
37
+ echo "triggered by this release event. No manual triggering needed."
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.5.0
3
+ Version: 3.6.1
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
7
7
  License: MIT
8
8
  License-File: LICENSE
9
- Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
9
+ Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Intended Audience :: Developers
12
12
  Classifier: License :: OSI Approved :: MIT License
@@ -36,16 +36,19 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
36
36
  Provides-Extra: all
37
37
  Requires-Dist: click>=8.2.1; extra == 'all'
38
38
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
39
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
39
40
  Requires-Dist: gmft>=0.4.2; extra == 'all'
40
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
41
+ Requires-Dist: keybert>=0.9.0; extra == 'all'
42
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
41
43
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
42
44
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
43
45
  Requires-Dist: rich>=14.0.0; extra == 'all'
44
46
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
45
47
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
48
+ Requires-Dist: spacy>=3.8.7; extra == 'all'
46
49
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
47
50
  Provides-Extra: api
48
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
51
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
49
52
  Provides-Extra: chunking
50
53
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
51
54
  Provides-Extra: cli
@@ -54,10 +57,13 @@ Requires-Dist: rich>=14.0.0; extra == 'cli'
54
57
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
55
58
  Provides-Extra: easyocr
56
59
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
60
+ Provides-Extra: entity-extraction
61
+ Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
62
+ Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
57
63
  Provides-Extra: gmft
58
64
  Requires-Dist: gmft>=0.4.2; extra == 'gmft'
59
65
  Provides-Extra: langdetect
60
- Requires-Dist: fast-langdetect>=0.2.0; extra == 'langdetect'
66
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
61
67
  Provides-Extra: paddleocr
62
68
  Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
63
69
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
@@ -1,6 +1,6 @@
1
1
  metadata:
2
2
  name: "Kreuzberg"
3
- version: "3.4.0"
3
+ version: "3.5.0"
4
4
  description: "A text extraction library supporting PDFs, images, office documents and more"
5
5
 
6
6
  outputs:
@@ -115,6 +115,7 @@ rules:
115
115
  - **OCR Backends**: Pluggable OCR engines with separate configuration classes
116
116
  - **GMFT Integration**: Table extraction using GMFT library for PDFs
117
117
  - **Chunking**: Text splitting functionality in `_chunker.py`
118
+ - **Language Detection**: Automatic language detection using fast-langdetect
118
119
  - **Async/Sync**: Primary async implementation with sync wrappers in `_utils/_sync.py`
119
120
  - **API Server**: REST API using Litestar framework in `_api/main.py`
120
121
  - **CLI**: Command-line interface for batch processing and automation
@@ -144,6 +145,8 @@ rules:
144
145
  - Mock OCR responses for predictable testing
145
146
  - Both sync and async test variants
146
147
  - Comprehensive error case coverage
148
+ - OCR tests marked as `xfail` in CI environments for resilience
149
+ - Integration tests use timeouts and retry logic where appropriate
147
150
 
148
151
  - name: "Important Instructions"
149
152
  priority: 10
@@ -160,16 +163,17 @@ rules:
160
163
  priority: 6
161
164
  content: |
162
165
  ### GitHub Actions Workflows
163
- - **Release**: Automated PyPI publishing via GitHub releases
164
- - **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64)
166
+ - **Release**: Automated PyPI publishing via GitHub releases, triggers Docker builds
167
+ - **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64), triggered by releases
165
168
  - **Documentation**: Auto-deploy to GitHub Pages on docs changes
169
+ - **CI**: Comprehensive testing across multiple Python versions and platforms
166
170
 
167
171
  ### Docker Variants
168
- - **Core** (`goldziher/kreuzberg:v3.4.0`): API + Tesseract OCR
169
- - **EasyOCR** (`goldziher/kreuzberg:v3.4.0-easyocr`): Core + EasyOCR
170
- - **PaddleOCR** (`goldziher/kreuzberg:v3.4.0-paddle`): Core + PaddleOCR
171
- - **GMFT** (`goldziher/kreuzberg:v3.4.0-gmft`): Core + table extraction
172
- - **All** (`goldziher/kreuzberg:v3.4.0-all`): All features included
172
+ - **Core** (`goldziher/kreuzberg:v3.5.0`): API + Tesseract OCR
173
+ - **EasyOCR** (`goldziher/kreuzberg:v3.5.0-easyocr`): Core + EasyOCR
174
+ - **PaddleOCR** (`goldziher/kreuzberg:v3.5.0-paddle`): Core + PaddleOCR
175
+ - **GMFT** (`goldziher/kreuzberg:v3.5.0-gmft`): Core + table extraction
176
+ - **All** (`goldziher/kreuzberg:v3.5.0-all`): All features included
173
177
 
174
178
  ### Manual Triggers
175
179
  - Docker builds: `gh workflow run "Publish Docker Images"`
@@ -191,8 +195,9 @@ rules:
191
195
  chunking = ["semantic-text-splitter>=0.27.0"]
192
196
  easyocr = ["easyocr>=1.7.2"]
193
197
  gmft = ["gmft>=0.4.2"]
198
+ langdetect = ["fast-langdetect>=0.2.0"]
194
199
  paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
195
- all = ["kreuzberg[api,chunking,cli,easyocr,gmft,paddleocr]"]
200
+ all = ["kreuzberg[api,chunking,cli,easyocr,gmft,langdetect,paddleocr]"]
196
201
  ```
197
202
 
198
203
  ### Installation Patterns
@@ -207,6 +212,17 @@ rules:
207
212
  - **Development**: Uses dependency groups in pyproject.toml
208
213
 
209
214
  sections:
215
+ - title: "Language Detection"
216
+ content: |
217
+ ### Automatic Language Detection (v3.5.0+)
218
+ - **Feature**: Automatically detect languages in extracted text
219
+ - **Implementation**: Uses fast-langdetect library for high-performance detection
220
+ - **Configuration**:
221
+ - Enable with `auto_detect_language=True` in `ExtractionConfig`
222
+ - Configure via `LanguageDetectionConfig` for confidence thresholds
223
+ - **Output**: Results available in `ExtractionResult.detected_languages`
224
+ - **Integration**: Works with all extraction methods and file types
225
+
210
226
  - title: "Planned Features"
211
227
  content: |
212
228
  ### Structured Extraction (Issue #55)
@@ -40,10 +40,28 @@ Configuration options for the GMFT table extraction engine:
40
40
 
41
41
  ::: kreuzberg.GMFTConfig
42
42
 
43
+ ## Entity Extraction Configuration
44
+
45
+ Configuration options for spaCy-based entity extraction:
46
+
47
+ ::: kreuzberg.SpacyEntityExtractionConfig
48
+
49
+ ## Language Detection Configuration
50
+
51
+ Configuration options for automatic language detection:
52
+
53
+ ::: kreuzberg.LanguageDetectionConfig
54
+
43
55
  ## PSMMode (Page Segmentation Mode)
44
56
 
45
57
  ::: kreuzberg.PSMMode
46
58
 
59
+ ## Entity
60
+
61
+ Represents an extracted named entity:
62
+
63
+ ::: kreuzberg.Entity
64
+
47
65
  ## Metadata
48
66
 
49
67
  A TypedDict that contains optional metadata fields extracted from documents:
@@ -189,6 +189,83 @@ async def process_upload(file_content: bytes, mime_type: str):
189
189
  print(f"{key}: {value}")
190
190
  ```
191
191
 
192
+ ## Keywords
193
+
194
+ Kreuzberg supports keyword extraction, as shown below (regex-based custom entity patterns are covered in the next section):
195
+
196
+ ```python
197
+ from kreuzberg import ExtractionConfig, extract_file
198
+
199
+ async def extract_keywords():
200
+ config = ExtractionConfig(
201
+ extract_keywords=True,
202
+ keyword_count=5, # defaults to 10 if not set
203
+ )
204
+ result = await extract_file(
205
+ "document.pdf",
206
+ config=config,
207
+ )
208
+ print(f"Keywords: {result.keywords}")
209
+ ```
210
+
211
+ ## Entity and Keyword Extraction
212
+
213
+ Kreuzberg can extract named entities using spaCy and keywords using KeyBERT. It automatically detects entities such as people, organizations, and locations, and it also supports custom regex patterns:
214
+
215
+ ```python
216
+ from kreuzberg import ExtractionConfig, extract_file, SpacyEntityExtractionConfig
217
+
218
+ async def extract_entities_and_keywords():
219
+ # Basic extraction
220
+ config = ExtractionConfig(
221
+ extract_entities=True,
222
+ extract_keywords=True,
223
+ keyword_count=5,
224
+ custom_entity_patterns={
225
+ "INVOICE_ID": r"INV-\d+",
226
+ "EMAIL": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
227
+ },
228
+ )
229
+ result = await extract_file("document.pdf", config=config)
230
+
231
+ # Print extracted entities
232
+ if result.entities:
233
+ for entity in result.entities:
234
+ print(f"{entity.type}: {entity.text}")
235
+
236
+ # Print extracted keywords
237
+ if result.keywords:
238
+ for keyword, score in result.keywords:
239
+ print(f"Keyword: {keyword} (score: {score:.3f})")
240
+
241
+ async def extract_multilingual_entities():
242
+ # Configure spaCy for multiple languages
243
+ spacy_config = SpacyEntityExtractionConfig(
244
+ language_models={
245
+ "en": "en_core_web_sm",
246
+ "de": "de_core_news_sm",
247
+ "fr": "fr_core_news_sm",
248
+ },
249
+ fallback_to_multilingual=True,
250
+ )
251
+
252
+ config = ExtractionConfig(
253
+ auto_detect_language=True, # Automatically detect document languages
254
+ extract_entities=True,
255
+ spacy_entity_extraction_config=spacy_config,
256
+ )
257
+
258
+ result = await extract_file("multilingual_document.pdf", config=config)
259
+
260
+ if result.detected_languages:
261
+ print(f"Detected languages: {result.detected_languages}")
262
+
263
+ if result.entities:
264
+ print(f"Extracted {len(result.entities)} entities")
265
+ for entity in result.entities:
266
+ print(f" {entity.type}: {entity.text}")
267
+ ```
268
+
192
269
  ## Synchronous API
193
270
 
194
271
  For cases where async isn't needed or available:
@@ -110,6 +110,30 @@ Language detection is an optional feature that automatically detects the languag
110
110
  pip install "kreuzberg[langdetect]"
111
111
  ```
112
112
 
113
+ ### Entity and Keyword Extraction
114
+
115
+ Entity and keyword extraction are optional features that extract named entities and keywords from documents. Entity extraction uses [spaCy](https://spacy.io/) for multilingual named entity recognition, while keyword extraction uses [KeyBERT](https://github.com/MaartenGr/KeyBERT) for semantic keyword extraction:
116
+
117
+ ```shell
118
+ pip install "kreuzberg[entity-extraction]"
119
+ ```
120
+
121
+ After installation, you'll need to download the spaCy language models you plan to use:
122
+
123
+ ```shell
124
+ # Download English model (most common)
125
+ python -m spacy download en_core_web_sm
126
+
127
+ # Download other language models as needed
128
+ python -m spacy download de_core_news_sm # German
129
+ python -m spacy download fr_core_news_sm # French
130
+ python -m spacy download es_core_news_sm # Spanish
131
+ ```
132
+
133
+ !!! note "Language Model Requirements"
134
+
135
+ spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
136
+
113
137
  ### All Optional Dependencies
114
138
 
115
139
  To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -121,5 +145,5 @@ pip install "kreuzberg[all]"
121
145
  This is equivalent to:
122
146
 
123
147
  ```shell
124
- pip install "kreuzberg[chunking,easyocr,gmft,langdetect,paddleocr]"
148
+ pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
125
149
  ```
@@ -153,6 +153,134 @@ The feature requires the `langdetect` dependency:
153
153
  pip install "kreuzberg[langdetect]"
154
154
  ```
155
155
 
156
+ ### Entity and Keyword Extraction
157
+
158
+ Kreuzberg can extract named entities and keywords from documents using spaCy for entity recognition and KeyBERT for keyword extraction:
159
+
160
+ ```python
161
+ from kreuzberg import extract_file, ExtractionConfig, SpacyEntityExtractionConfig
162
+
163
+ # Basic entity and keyword extraction
164
+ result = await extract_file(
165
+ "document.pdf",
166
+ config=ExtractionConfig(
167
+ extract_entities=True,
168
+ extract_keywords=True,
169
+ keyword_count=10, # Number of keywords to extract (default: 10)
170
+ ),
171
+ )
172
+
173
+ # Access extracted entities and keywords
174
+ if result.entities:
175
+ for entity in result.entities:
176
+ print(f"{entity.type}: {entity.text} (position {entity.start}-{entity.end})")
177
+ # Example: "PERSON: John Doe (position 0-8)"
178
+
179
+ if result.keywords:
180
+ for keyword, score in result.keywords:
181
+ print(f"{keyword}: {score:.3f}")
182
+ # Example: "artificial intelligence: 0.845"
183
+ ```
184
+
185
+ #### Entity Extraction with Language Support
186
+
187
+ spaCy supports entity extraction in multiple languages. You can configure language-specific models:
188
+
189
+ ```python
190
+ from kreuzberg import extract_file, ExtractionConfig, SpacyEntityExtractionConfig
191
+
192
+ # Configure spaCy for specific languages
193
+ spacy_config = SpacyEntityExtractionConfig(
194
+ language_models={
195
+ "en": "en_core_web_sm", # English
196
+ "de": "de_core_news_sm", # German
197
+ "fr": "fr_core_news_sm", # French
198
+ "es": "es_core_news_sm", # Spanish
199
+ },
200
+ model_cache_dir="/tmp/spacy_models", # Custom model cache directory
201
+ fallback_to_multilingual=True, # Use multilingual model if language-specific model fails
202
+ )
203
+
204
+ # Extract with language detection to automatically choose the right model
205
+ result = await extract_file(
206
+ "multilingual_document.pdf",
207
+ config=ExtractionConfig(
208
+ auto_detect_language=True, # Enable language detection
209
+ extract_entities=True,
210
+ spacy_entity_extraction_config=spacy_config,
211
+ ),
212
+ )
213
+
214
+ # The system will automatically use the appropriate spaCy model based on detected languages
215
+ if result.detected_languages and result.entities:
216
+ print(f"Detected languages: {result.detected_languages}")
217
+ print(f"Extracted {len(result.entities)} entities")
218
+ ```
219
+
220
+ #### Custom Entity Patterns
221
+
222
+ You can define custom entity patterns using regular expressions:
223
+
224
+ ```python
225
+ result = await extract_file(
226
+ "invoice.pdf",
227
+ config=ExtractionConfig(
228
+ extract_entities=True,
229
+ custom_entity_patterns={
230
+ "INVOICE_ID": r"INV-\d{4,}", # Invoice numbers
231
+ "PHONE": r"\+?\d{1,3}[-.\s]?\d{3,4}[-.\s]?\d{3,4}[-.\s]?\d{3,4}", # Phone numbers
232
+ "EMAIL": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", # Email addresses
233
+ },
234
+ ),
235
+ )
236
+
237
+ # Custom patterns are combined with spaCy's standard entity types
238
+ for entity in result.entities:
239
+ if entity.type in ["INVOICE_ID", "PHONE", "EMAIL"]:
240
+ print(f"Custom entity - {entity.type}: {entity.text}")
241
+ else:
242
+ print(f"Standard entity - {entity.type}: {entity.text}")
243
+ ```
244
+
245
+ #### Supported Entity Types
246
+
247
+ spaCy automatically detects these standard entity types:
248
+
249
+ - **PERSON**: People's names
250
+ - **ORG**: Organizations, companies, agencies
251
+ - **GPE**: Countries, cities, states (Geopolitical entities)
252
+ - **MONEY**: Monetary values
253
+ - **DATE**: Date expressions
254
+ - **TIME**: Time expressions
255
+ - **PERCENT**: Percentage values
256
+ - **CARDINAL**: Numerals that do not fall under another type
257
+
258
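+ The entity types listed here follow the label scheme of spaCy's English models. Other models may use a different label set: for example, the German, French, and Spanish `*_core_news_sm` models and the multilingual `xx_ent_wiki_sm` model emit `PER`, `LOC`, `ORG`, and `MISC` instead. Check the documentation of each model for the labels it produces.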
259
+
260
+ #### spaCy Configuration Options
261
+
262
+ - `language_models`: Dict mapping language codes to spaCy model names
263
+ - `model_cache_dir`: Custom directory for caching spaCy models
264
+ - `fallback_to_multilingual`: Whether to use the multilingual model (`xx_ent_wiki_sm`) as a fallback
265
+ - `max_doc_length`: Maximum document length for spaCy processing (default: 1,000,000 characters)
266
+ - `batch_size`: Batch size for processing multiple texts (default: 1,000)
267
+
268
+ #### Installation Requirements
269
+
270
+ Entity and keyword extraction require additional dependencies:
271
+
272
+ ```shell
273
+ # For entity extraction (spaCy) and keyword extraction (KeyBERT)
274
+ pip install "kreuzberg[entity-extraction]"
275
+
276
+ # Install specific spaCy language models as needed
277
+ python -m spacy download en_core_web_sm # English
278
+ python -m spacy download de_core_news_sm # German
279
+ python -m spacy download fr_core_news_sm # French
280
+ ```
281
+
282
+ Available spaCy models include: `en_core_web_sm`, `de_core_news_sm`, `fr_core_news_sm`, `es_core_news_sm`, `pt_core_news_sm`, `it_core_news_sm`, `nl_core_news_sm`, `zh_core_web_sm`, `ja_core_news_sm`, `ko_core_news_sm`, `ru_core_news_sm`, and many others.
283
+
156
284
  ### Batch Processing
157
285
 
158
286
  ```python
@@ -1,5 +1,6 @@
1
1
  from importlib.metadata import version
2
2
 
3
+ from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
3
4
  from kreuzberg._gmft import GMFTConfig
4
5
  from kreuzberg._language_detection import LanguageDetectionConfig
5
6
  from kreuzberg._ocr._easyocr import EasyOCRConfig
@@ -8,7 +9,7 @@ from kreuzberg._ocr._tesseract import TesseractConfig
8
9
 
9
10
  from ._ocr._tesseract import PSMMode
10
11
  from ._registry import ExtractorRegistry
11
- from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
12
+ from ._types import Entity, ExtractionConfig, ExtractionResult, Metadata, TableData
12
13
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
13
14
  from .extraction import (
14
15
  batch_extract_bytes,
@@ -25,6 +26,7 @@ __version__ = version("kreuzberg")
25
26
 
26
27
  __all__ = [
27
28
  "EasyOCRConfig",
29
+ "Entity",
28
30
  "ExtractionConfig",
29
31
  "ExtractionResult",
30
32
  "ExtractorRegistry",
@@ -37,6 +39,7 @@ __all__ = [
37
39
  "PSMMode",
38
40
  "PaddleOCRConfig",
39
41
  "ParsingError",
42
+ "SpacyEntityExtractionConfig",
40
43
  "TableData",
41
44
  "TesseractConfig",
42
45
  "ValidationError",