kreuzberg 3.15.0__tar.gz → 3.16.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/ci.yaml +1 -1
  2. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.pre-commit-config.yaml +1 -1
  3. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/PKG-INFO +12 -11
  4. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/README.md +10 -9
  5. kreuzberg-3.16.0/Taskfile.yml +50 -0
  6. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/types.md +12 -0
  7. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/examples/extraction-examples.md +83 -1
  8. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/extraction-configuration.md +68 -1
  9. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/metadata-extraction.md +51 -0
  10. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/supported-formats.md +14 -1
  11. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/__init__.py +4 -0
  12. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_api/main.py +0 -53
  13. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_config.py +11 -1
  14. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_document_classification.py +1 -1
  15. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_email.py +16 -10
  16. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_html.py +39 -12
  17. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_pdf.py +2 -3
  18. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_presentation.py +4 -0
  19. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_spread_sheet.py +0 -1
  20. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_structured.py +83 -15
  21. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_gmft.py +5 -0
  22. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_mcp/server.py +0 -21
  23. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/_easyocr.py +51 -19
  24. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/_tesseract.py +14 -3
  25. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_types.py +111 -40
  26. kreuzberg-3.16.0/kreuzberg/_utils/_html_streaming.py +20 -0
  27. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_serialization.py +13 -6
  28. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_sync.py +15 -16
  29. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/extraction.py +2 -2
  30. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/pyproject.toml +3 -3
  31. kreuzberg-3.16.0/tests/api/config_cache_test.py +248 -0
  32. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/image_extraction_test.py +4 -1
  33. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/main_test.py +7 -7
  34. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/runtime_config_test.py +4 -1
  35. kreuzberg-3.16.0/tests/core/comprehensive_config_test.py +603 -0
  36. kreuzberg-3.16.0/tests/core/constants_test.py +22 -0
  37. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/dpi_configuration_test.py +19 -0
  38. kreuzberg-3.16.0/tests/core/exceptions_test.py +159 -0
  39. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/extraction_batch_test.py +8 -65
  40. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/extraction_test.py +75 -38
  41. kreuzberg-3.16.0/tests/core/init_test.py +85 -0
  42. kreuzberg-3.16.0/tests/core/main_test.py +35 -0
  43. kreuzberg-3.16.0/tests/core/mime_types_test.py +242 -0
  44. kreuzberg-3.16.0/tests/core/registry_test.py +225 -0
  45. kreuzberg-3.16.0/tests/core/types_test.py +403 -0
  46. kreuzberg-3.16.0/tests/extractors/base_extractor_test.py +420 -0
  47. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/base_ocr_processing_test.py +6 -18
  48. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/email_test.py +1 -1
  49. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/image_error_handling_test.py +5 -3
  50. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/image_test.py +2 -19
  51. kreuzberg-3.16.0/tests/extractors/json_test.py +427 -0
  52. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/pandoc_test.py +27 -29
  53. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/pdf_test.py +12 -7
  54. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/spreadsheet_test.py +17 -13
  55. kreuzberg-3.16.0/tests/features/chunker_test.py +94 -0
  56. kreuzberg-3.16.0/tests/features/document_classification_test.py +747 -0
  57. kreuzberg-3.16.0/tests/features/entity_extraction_test.py +348 -0
  58. kreuzberg-3.16.0/tests/features/gmft_test.py +1496 -0
  59. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/features/language_detection_test.py +6 -34
  60. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/all_extractors_images_test.py +45 -24
  61. kreuzberg-3.16.0/tests/interfaces/cli_test.py +527 -0
  62. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/interfaces/mcp_server_test.py +44 -203
  63. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/multiprocessing/gmft_isolated_test.py +1 -0
  64. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/easyocr_test.py +6 -0
  65. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/paddleocr_test.py +1 -0
  66. kreuzberg-3.16.0/tests/test_source_files/json/complex_nested.json +41 -0
  67. kreuzberg-3.16.0/tests/test_source_files/json/real_world/aws_policy.json +43 -0
  68. kreuzberg-3.16.0/tests/test_source_files/json/real_world/earthquakes.geojson +6 -0
  69. kreuzberg-3.16.0/tests/test_source_files/json/real_world/github_emojis.json +111 -0
  70. kreuzberg-3.16.0/tests/test_source_files/json/real_world/iss_location.json +1 -0
  71. kreuzberg-3.16.0/tests/test_source_files/json/real_world/openapi_spec.json +84 -0
  72. kreuzberg-3.16.0/tests/test_source_files/json/real_world/package.json +33 -0
  73. kreuzberg-3.16.0/tests/test_source_files/json/real_world/rick_morty_character.json +1 -0
  74. kreuzberg-3.16.0/tests/test_source_files/json/schema_test.json +25 -0
  75. kreuzberg-3.16.0/tests/utils/playa_metadata_test.py +753 -0
  76. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/playa_test.py +68 -17
  77. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/process_pool_test.py +1 -1
  78. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/serialization_test.py +82 -0
  79. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/uv.lock +23 -23
  80. kreuzberg-3.15.0/Taskfile.yml +0 -161
  81. kreuzberg-3.15.0/tests/core/exceptions_test.py +0 -0
  82. kreuzberg-3.15.0/tests/core/mime_types_test.py +0 -0
  83. kreuzberg-3.15.0/tests/core/registry_test.py +0 -0
  84. kreuzberg-3.15.0/tests/core/types_test.py +0 -23
  85. kreuzberg-3.15.0/tests/features/chunker_test.py +0 -0
  86. kreuzberg-3.15.0/tests/features/document_classification_test.py +0 -0
  87. kreuzberg-3.15.0/tests/features/entity_extraction_test.py +0 -0
  88. kreuzberg-3.15.0/tests/features/gmft_test.py +0 -528
  89. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.commitlintrc +0 -0
  90. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.deepsource.toml +0 -0
  91. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.docker/Dockerfile +0 -0
  92. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.docker/README.md +0 -0
  93. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.dockerignore +0 -0
  94. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/dependabot.yaml +0 -0
  95. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/docker-e2e-tests.yml +0 -0
  96. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/docs.yml +0 -0
  97. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/pr-title.yaml +0 -0
  98. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/publish-docker.yml +0 -0
  99. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/release.yaml +0 -0
  100. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/test-docker-builds.yml +0 -0
  101. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.gitignore +0 -0
  102. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.markdownlint.yaml +0 -0
  103. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/LICENSE +0 -0
  104. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/ai-rulez.yaml +0 -0
  105. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/README.md +0 -0
  106. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/__init__.py +0 -0
  107. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/batch_size_benchmark.py +0 -0
  108. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/batch_validation_benchmark.py +0 -0
  109. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/py.typed +0 -0
  110. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/pyproject.toml +0 -0
  111. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/__init__.py +0 -0
  112. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/__main__.py +0 -0
  113. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/benchmarks.py +0 -0
  114. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/cli.py +0 -0
  115. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/models.py +0 -0
  116. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/profiler.py +0 -0
  117. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/runner.py +0 -0
  118. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docker-logs/docker-info.txt +0 -0
  119. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docker-logs/docker-version.txt +0 -0
  120. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/advanced/custom-extractors.md +0 -0
  121. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/advanced/custom-hooks.md +0 -0
  122. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/advanced/error-handling.md +0 -0
  123. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/advanced/index.md +0 -0
  124. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/advanced/performance.md +0 -0
  125. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/exceptions.md +0 -0
  126. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/extraction-functions.md +0 -0
  127. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/extractor-registry.md +0 -0
  128. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/index.md +0 -0
  129. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/ocr-configuration.md +0 -0
  130. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/assets/favicon.png +0 -0
  131. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/assets/logo.png +0 -0
  132. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/cli.md +0 -0
  133. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/contributing.md +0 -0
  134. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/css/extra.css +0 -0
  135. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/examples/index.md +0 -0
  136. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/getting-started/index.md +0 -0
  137. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/getting-started/installation.md +0 -0
  138. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/getting-started/quick-start.md +0 -0
  139. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/index.md +0 -0
  140. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/api-server.md +0 -0
  141. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/basic-usage.md +0 -0
  142. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/chunking.md +0 -0
  143. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/docker.md +0 -0
  144. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/document-classification.md +0 -0
  145. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/index.md +0 -0
  146. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/mcp-server.md +0 -0
  147. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/ocr-backends.md +0 -0
  148. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/ocr-configuration.md +0 -0
  149. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/__main__.py +0 -0
  150. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_api/__init__.py +0 -0
  151. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_api/_config_cache.py +0 -0
  152. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_chunker.py +0 -0
  153. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_constants.py +0 -0
  154. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_entity_extraction.py +0 -0
  155. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/__init__.py +0 -0
  156. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_base.py +0 -0
  157. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_image.py +0 -0
  158. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_pandoc.py +0 -0
  159. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_language_detection.py +0 -0
  160. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_mcp/__init__.py +0 -0
  161. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_mime_types.py +0 -0
  162. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/__init__.py +0 -0
  163. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/_base.py +0 -0
  164. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
  165. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/_table_extractor.py +0 -0
  166. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_playa.py +0 -0
  167. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_registry.py +0 -0
  168. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/__init__.py +0 -0
  169. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_cache.py +0 -0
  170. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_device.py +0 -0
  171. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_document_cache.py +0 -0
  172. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_errors.py +0 -0
  173. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_image_preprocessing.py +0 -0
  174. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_ocr_cache.py +0 -0
  175. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  176. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_process_pool.py +0 -0
  177. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_quality.py +0 -0
  178. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_ref.py +0 -0
  179. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_resource_managers.py +0 -0
  180. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_string.py +0 -0
  181. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_table.py +0 -0
  182. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_tmp.py +0 -0
  183. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/cli.py +0 -0
  184. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/exceptions.py +0 -0
  185. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/py.typed +0 -0
  186. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/mkdocs.yaml +0 -0
  187. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/__init__.py +0 -0
  188. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/__init__.py +0 -0
  189. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/conftest.py +0 -0
  190. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/header_config_hashing_test.py +0 -0
  191. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/conftest.py +0 -0
  192. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/__init__.py +0 -0
  193. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/config_test.py +0 -0
  194. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/html_to_markdown_config_test.py +0 -0
  195. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/image_ocr_result_test.py +0 -0
  196. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/e2e/__init__.py +0 -0
  197. /kreuzberg-3.15.0/tests/e2e/docker_e2e_test.py → /kreuzberg-3.16.0/tests/e2e/docker_e2e.py +0 -0
  198. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/README_image_tests.md +0 -0
  199. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/__init__.py +0 -0
  200. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/base_memory_limits_test.py +0 -0
  201. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/base_ocr_simple_test.py +0 -0
  202. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/email_error_paths_test.py +0 -0
  203. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/html_invalid_base64_test.py +0 -0
  204. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/html_test.py +0 -0
  205. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/image_deduplication_test.py +0 -0
  206. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/image_error_simple_test.py +0 -0
  207. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  208. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/pdf_images_test.py +0 -0
  209. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/pdf_sync_images_test.py +0 -0
  210. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/presentation_test.py +0 -0
  211. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/structured_test.py +0 -0
  212. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/features/__init__.py +0 -0
  213. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/features/hooks_test.py +0 -0
  214. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/__init__.py +0 -0
  215. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/api/__init__.py +0 -0
  216. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/api/large_file_test.py +0 -0
  217. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/api/mounted_config_test.py +0 -0
  218. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/dpi_integration_test.py +0 -0
  219. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/multiprocessing/__init__.py +0 -0
  220. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
  221. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/ocr/__init__.py +0 -0
  222. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/ocr/device_integration_test.py +0 -0
  223. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
  224. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
  225. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/pandoc_images_test.py +0 -0
  226. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/pdf_images_test.py +0 -0
  227. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/pdf_real_images_test.py +0 -0
  228. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/pptx_complex_test.py +0 -0
  229. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/pptx_images_test.py +0 -0
  230. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/regression_test.py +0 -0
  231. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/interfaces/__init__.py +0 -0
  232. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/mcp/__init__.py +0 -0
  233. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/mcp/mcp_server_test.py +0 -0
  234. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/multiprocessing/__init__.py +0 -0
  235. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/multiprocessing/process_manager_test.py +0 -0
  236. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  237. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/__init__.py +0 -0
  238. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/base_test.py +0 -0
  239. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/init_test.py +0 -0
  240. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/tesseract_test.py +0 -0
  241. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/tesseract_tsv_test.py +0 -0
  242. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/performance/__init__.py +0 -0
  243. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/performance/large_pdf_perf_test.py +0 -0
  244. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
  245. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/contract.txt +0 -0
  246. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/contract_test.txt +0 -0
  247. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/document.docx +0 -0
  248. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/email/sample-email.eml +0 -0
  249. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  250. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/excel.xlsx +0 -0
  251. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/flower-no-text.jpg +0 -0
  252. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/form_test.txt +0 -0
  253. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/french-text.txt +0 -0
  254. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/german-text.txt +0 -0
  255. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/google-doc-document.pdf +0 -0
  256. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/html.html +0 -0
  257. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/images/test_hello_world.png +0 -0
  258. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/invoice_image.png +0 -0
  259. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/invoice_test.txt +0 -0
  260. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/json/sample-document.json +0 -0
  261. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  262. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/markdown.md +0 -0
  263. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  264. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/non-searchable.pdf +0 -0
  265. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/ocr-image.jpg +0 -0
  266. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  267. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  268. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  269. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  270. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/receipt_test.txt +0 -0
  271. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/report_test.txt +0 -0
  272. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/sample-contract.pdf +0 -0
  273. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/scanned.pdf +0 -0
  274. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/searchable.pdf +0 -0
  275. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/sharable-web-guide.pdf +0 -0
  276. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/spanish-text.txt +0 -0
  277. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/tables/borderless_table.png +0 -0
  278. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/tables/complex_document.png +0 -0
  279. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/tables/simple_table.png +0 -0
  280. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/test-article.pdf +0 -0
  281. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/test-excel.xls +0 -0
  282. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  283. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/__init__.py +0 -0
  284. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/cache_test.py +0 -0
  285. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/device_test.py +0 -0
  286. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/errors_test.py +0 -0
  287. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/ocr_cache_test.py +0 -0
  288. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/pdf_lock_test.py +0 -0
  289. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/playa_helpers_test.py +0 -0
  290. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/quality_test.py +0 -0
  291. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/ref_test.py +0 -0
  292. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/string_test.py +0 -0
  293. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/sync_test.py +0 -0
  294. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/table_test.py +0 -0
  295. {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/tmp_test.py +0 -0
@@ -212,7 +212,7 @@ jobs:
212
212
  uses: actions/checkout@v5
213
213
 
214
214
  - name: Download Coverage Artifacts
215
- uses: actions/download-artifact@v4
215
+ uses: actions/download-artifact@v5
216
216
  with:
217
217
  pattern: coverage-*-${{ github.sha }}
218
218
  merge-multiple: true
@@ -11,7 +11,7 @@ repos:
11
11
  - id: name-tests-test
12
12
  args:
13
13
  - --pytest
14
- exclude: factories|test_utils|completion.py|test_data
14
+ exclude: factories|test_utils|completion.py|test_data|docker_e2e.py
15
15
  - id: trailing-whitespace
16
16
  - id: end-of-file-fixer
17
17
  - id: check-toml
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.15.0
3
+ Version: 3.16.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,7 +31,7 @@ Requires-Python: >=3.10
31
31
  Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.11.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.13.0
35
35
  Requires-Dist: mcp>=1.14.0
36
36
  Requires-Dist: msgspec>=0.18.0
37
37
  Requires-Dist: numpy>=2.0.0
@@ -109,7 +109,7 @@ Description-Content-Type: text/markdown
109
109
  - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
110
110
  - **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
111
111
  - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
112
- - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
112
+ - **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
113
113
  - **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
114
114
  - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
115
115
 
@@ -227,14 +227,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp
227
227
 
228
228
  ## Supported Formats
229
229
 
230
- | Category | Formats |
231
- | ----------------- | ------------------------------ |
232
- | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
233
- | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
234
- | **Spreadsheets** | XLSX, XLS, CSV, ODS |
235
- | **Presentations** | PPTX, PPT, ODP |
236
- | **Web** | HTML, XML, MHTML |
237
- | **Archives** | Support via extraction |
230
+ | Category | Formats |
231
+ | ------------------- | ------------------------------ |
232
+ | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
233
+ | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
234
+ | **Spreadsheets** | XLSX, XLS, CSV, ODS |
235
+ | **Presentations** | PPTX, PPT, ODP |
236
+ | **Web** | HTML, XML, MHTML |
237
+ | **Structured Data** | JSON, YAML, TOML |
238
+ | **Archives** | Support via extraction |
238
239
 
239
240
  ## 📊 Performance Characteristics
240
241
 
@@ -18,7 +18,7 @@
18
18
  - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
19
19
  - **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
20
20
  - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
21
- - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
21
+ - **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
22
22
  - **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
23
23
  - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
24
24
 
@@ -136,14 +136,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp
136
136
 
137
137
  ## Supported Formats
138
138
 
139
- | Category | Formats |
140
- | ----------------- | ------------------------------ |
141
- | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
142
- | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
143
- | **Spreadsheets** | XLSX, XLS, CSV, ODS |
144
- | **Presentations** | PPTX, PPT, ODP |
145
- | **Web** | HTML, XML, MHTML |
146
- | **Archives** | Support via extraction |
139
+ | Category | Formats |
140
+ | ------------------- | ------------------------------ |
141
+ | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
142
+ | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
143
+ | **Spreadsheets** | XLSX, XLS, CSV, ODS |
144
+ | **Presentations** | PPTX, PPT, ODP |
145
+ | **Web** | HTML, XML, MHTML |
146
+ | **Structured Data** | JSON, YAML, TOML |
147
+ | **Archives** | Support via extraction |
147
148
 
148
149
  ## 📊 Performance Characteristics
149
150
 
@@ -0,0 +1,50 @@
1
+ version: "3"
2
+
3
+ env:
4
+ DOCKER_BUILDKIT: 1
5
+ BUILDKIT_PROGRESS: plain
6
+
7
+ tasks:
8
+ setup:
9
+ desc: "Install dependencies with uv"
10
+ cmds:
11
+ - uv sync --all-extras --all-packages
12
+ - pre-commit install && pre-commit install --hook-type commit-msg
13
+
14
+ update:
15
+ desc: "Update the dependencies"
16
+ cmds:
17
+ - uv run uv-bump
18
+ - cd benchmarks && uv run uv-bump && cd -
19
+ - uv sync --all-extras --all-packages --upgrade
20
+ - pre-commit autoupdate
21
+
22
+ test:
23
+ desc: "Run tests with pytest"
24
+ cmds:
25
+ - uv run pytest
26
+
27
+ test:cov:
28
+ desc: "Run tests with coverage"
29
+ cmds:
30
+ - uv run pytest --cov
31
+
32
+ lint:
33
+ desc: "Lint code with ruff and docs with markdownlint"
34
+ cmds:
35
+ - pre-commit run --all-files
36
+
37
+ docs:build:
38
+ desc: "Build documentation"
39
+ cmds:
40
+ - uv run mkdocs build --clean --strict
41
+
42
+ docs:serve:
43
+ desc: "Serve documentation locally"
44
+ cmds:
45
+ - uv run mkdocs serve
46
+
47
+ default:
48
+ desc: "Show available tasks"
49
+ cmds:
50
+ - task --list
@@ -72,6 +72,18 @@ Configuration options for automatic language detection:
72
72
 
73
73
  ::: kreuzberg.LanguageDetectionConfig
74
74
 
75
+ ## JSON Extraction Configuration
76
+
77
+ Configuration for enhanced JSON document processing:
78
+
79
+ ::: kreuzberg.JSONExtractionConfig
80
+
81
+ ## HTML to Markdown Configuration
82
+
83
+ Configuration options for converting HTML content to Markdown:
84
+
85
+ ::: kreuzberg.HTMLToMarkdownConfig
86
+
75
87
  ## PSMMode (Page Segmentation Mode)
76
88
 
77
89
  ::: kreuzberg.PSMMode
@@ -525,13 +525,95 @@ async def comprehensive_extraction():
525
525
  print(f"Total text (including OCR): {len(all_text)} characters")
526
526
  ```
527
527
 
528
+ ## JSON and Structured Data Extraction
529
+
530
+ ### Basic JSON Extraction
531
+
532
+ ```python
533
+ from kreuzberg import extract_file_sync
534
+
535
+ # Simple JSON extraction
536
+ result = extract_file_sync("data.json")
537
+ print(result.content)
538
+
539
+ # Metadata includes detected text fields
540
+ print(f"Title: {result.metadata.get('title')}")
541
+ print(f"Description: {result.metadata.get('description')}")
542
+ ```
543
+
544
+ ### Advanced JSON with Schema Extraction
545
+
546
+ ```python
547
+ from kreuzberg import extract_file_sync, ExtractionConfig, JSONExtractionConfig
548
+
549
+ # Configure advanced JSON extraction
550
+ json_config = JSONExtractionConfig(
551
+ extract_schema=True, # Extract JSON structure
552
+ custom_text_field_patterns=frozenset({"summary", "abstract"}), # Custom fields
553
+ include_type_info=True, # Add type annotations
554
+ flatten_nested_objects=True, # Flatten nested structures
555
+ max_depth=5, # Limit schema depth
556
+ array_item_limit=100, # Limit array processing
557
+ )
558
+
559
+ config = ExtractionConfig(json_config=json_config)
560
+ result = extract_file_sync("complex.json", config=config)
561
+
562
+ # Access schema information
563
+ if "json_schema" in result.metadata:
564
+ schema = result.metadata["json_schema"]
565
+ print(f"Root type: {schema['type']}")
566
+ print(f"Properties: {list(schema.get('properties', {}).keys())}")
567
+
568
+ # Access nested attributes with dotted notation
569
+ if "attributes" in result.metadata:
570
+ attrs = result.metadata["attributes"]
571
+ # Nested fields like {"info": {"title": "Example"}} become "info.title"
572
+ print(f"Nested title: {attrs.get('info.title')}")
573
+ ```
574
+
575
+ ### YAML and TOML Processing
576
+
577
+ ```python
578
+ from kreuzberg import extract_file_sync
579
+
580
+ # YAML extraction (similar to JSON)
581
+ yaml_result = extract_file_sync("config.yaml")
582
+ print(yaml_result.content)
583
+
584
+ # TOML extraction
585
+ toml_result = extract_file_sync("pyproject.toml")
586
+ print(toml_result.content)
587
+
588
+ # Both formats support the same metadata extraction as JSON
589
+ print(f"Package name: {toml_result.metadata.get('name')}")
590
+ ```
591
+
592
+ ### Working with API Responses
593
+
594
+ ```python
595
+ import httpx
596
+ from kreuzberg import extract_bytes_sync, ExtractionConfig, JSONExtractionConfig
597
+
598
+ # Fetch JSON from API
599
+ response = httpx.get("https://api.example.com/data")
600
+
601
+ # Extract with schema
602
+ config = ExtractionConfig(json_config=JSONExtractionConfig(extract_schema=True))
603
+
604
+ result = extract_bytes_sync(response.content, mime_type="application/json", config=config)
605
+
606
+ print(f"API Response: {result.content}")
607
+ print(f"Schema: {result.metadata.get('json_schema')}")
608
+ ```
609
+
528
610
  ## Batch Processing
529
611
 
530
612
  ```python
531
613
  from kreuzberg import batch_extract_file, ExtractionConfig
532
614
 
533
615
  async def process_documents():
534
- file_paths = ["document1.pdf", "document2.docx", "image.jpg"]
616
+ file_paths = ["document1.pdf", "document2.docx", "data.json", "image.jpg"]
535
617
  config = ExtractionConfig() # Optional: configure extraction options
536
618
  results = await batch_extract_file(file_paths, config=config)
537
619
 
@@ -94,6 +94,14 @@ strong_em_symbol = "_"
94
94
  escape_underscores = false
95
95
  wrap = true
96
96
  wrap_width = 100
97
+ list_indent_width = 2 # Use 2 spaces for Discord/Slack compatibility
98
+ list_indent_type = "spaces" # Use spaces instead of tabs
99
+ whitespace_mode = "normalized" # Handle whitespace intelligently
100
+ br_in_tables = false # Use spaces instead of <br> in tables
101
+ highlight_style = "double-equal" # Style for highlighted text
102
+ newline_style = "spaces" # Style for line breaks
103
+ preprocess_html = true # Clean messy HTML before conversion
104
+ preprocessing_preset = "standard" # Level of HTML cleaning
97
105
  ```
98
106
 
99
107
  ### pyproject.toml Example
@@ -623,6 +631,58 @@ For better performance in production:
623
631
  - Enable deduplication to avoid redundant processing
624
632
  - Use selective extraction based on document types
625
633
 
634
+ ### JSON Extraction Configuration
635
+
636
+ Kreuzberg provides enhanced JSON document processing with schema extraction and customizable field detection:
637
+
638
+ ```python
639
+ from kreuzberg import extract_file, ExtractionConfig, JSONExtractionConfig
640
+
641
+ # Advanced JSON extraction with schema
642
+ result = await extract_file(
643
+ "data.json",
644
+ config=ExtractionConfig(
645
+ json_config=JSONExtractionConfig(
646
+ extract_schema=True, # Extract JSON structure schema
647
+ include_type_info=True, # Add type annotations to output
648
+ flatten_nested_objects=True, # Flatten nested objects in output
649
+ custom_text_field_patterns=frozenset({"summary", "abstract"}), # Additional text fields
650
+ max_depth=10, # Maximum nesting depth for schema
651
+ array_item_limit=1000, # Limit array processing for performance
652
+ )
653
+ ),
654
+ )
655
+
656
+ # Access schema and nested attributes
657
+ if result.metadata.get("json_schema"):
658
+ print(f"JSON Schema: {result.metadata['json_schema']}")
659
+ if result.metadata.get("attributes"):
660
+ print(f"Nested fields: {result.metadata['attributes']}")
661
+ ```
662
+
663
+ #### Configuration File Support
664
+
665
+ Add JSON configuration to your `kreuzberg.toml`:
666
+
667
+ ```toml
668
+ [json_config]
669
+ extract_schema = true # Extract JSON structure schema
670
+ include_type_info = false # Add type annotations to output
671
+ flatten_nested_objects = true # Flatten nested objects in output
672
+ custom_text_field_patterns = ["summary", "abstract"] # Additional text fields to extract
673
+ max_depth = 10 # Maximum nesting depth for schema extraction
674
+ array_item_limit = 1000 # Limit array processing for performance
675
+ ```
676
+
677
+ #### Key Features
678
+
679
+ - **High Performance**: Uses msgspec for fast JSON parsing, significantly faster than the standard library `json` module
680
+ - **Schema Extraction**: Automatically extracts the structure of your JSON data, useful for understanding complex documents
681
+ - **Custom Field Detection**: Configure additional text fields beyond the defaults (title, name, description, content, body, text, message)
682
+ - **Type Information**: Optionally include data type annotations in extracted content for better understanding
683
+ - **Nested Object Control**: Choose between flattened or hierarchical output based on your needs
684
+ - **Memory Protection**: Array item limits prevent memory issues with large datasets
685
+
626
686
  ### Entity and Keyword Extraction
627
687
 
628
688
  Kreuzberg can extract named entities and keywords from documents using spaCy for entity recognition and KeyBERT for keyword extraction:
@@ -833,7 +893,14 @@ html_config = HTMLToMarkdownConfig(
833
893
  escape_underscores=False,
834
894
  wrap=True,
835
895
  wrap_width=100,
836
- preprocessing_preset="standard",
896
+ list_indent_width=2, # Discord/Slack compatible spacing
897
+ list_indent_type="spaces", # Use spaces for indentation
898
+ whitespace_mode="normalized", # Smart whitespace handling
899
+ br_in_tables=False, # Use spaces in table cells
900
+ highlight_style="double-equal", # ==highlighted== text style
901
+ newline_style="spaces", # Line break style
902
+ preprocess_html=True, # Clean HTML before conversion
903
+ preprocessing_preset="standard", # HTML cleaning level
837
904
  )
838
905
 
839
906
  result = await extract_file(
@@ -49,6 +49,54 @@ For PDF documents, Kreuzberg extracts a rich set of metadata including:
49
49
 
50
50
  If a PDF document contains UTF-16BE encoded strings (often present in PDF metadata with a byte order mark `\xfe\xff`), Kreuzberg will automatically detect and decode these properly.
51
51
 
52
+ ## Structured Data Metadata
53
+
54
+ For JSON, YAML, and TOML files, Kreuzberg provides specialized metadata extraction:
55
+
56
+ ### Text Field Detection
57
+
58
+ Kreuzberg automatically identifies and extracts common text fields:
59
+
60
+ - **Default fields**: `title`, `name`, `description`, `content`, `body`, `text`, `message`
61
+ - **Custom fields**: Configure additional patterns via `JSONExtractionConfig`
62
+
63
+ ### Nested Attributes
64
+
65
+ Complex nested fields are stored in `metadata.attributes` with dotted key notation:
66
+
67
+ ```python
68
+ from kreuzberg import extract_file_sync
69
+
70
+ # Example JSON with nested structure
71
+ result = extract_file_sync("complex.json")
72
+
73
+ # Access nested fields via attributes
74
+ if "attributes" in result.metadata:
75
+ # Nested fields like {"info": {"title": "Example"}} become "info.title"
76
+ nested_title = result.metadata["attributes"].get("info.title")
77
+
78
+ # Array items are indexed: {"items": [{"name": "first"}]} becomes "items[0].name"
79
+ first_item = result.metadata["attributes"].get("items[0].name")
80
+ ```
81
+
82
+ ### Schema Extraction
83
+
84
+ When `extract_schema` is enabled in `JSONExtractionConfig`, Kreuzberg extracts the structure of the JSON data:
85
+
86
+ ```python
87
+ from kreuzberg import extract_file_sync, ExtractionConfig, JSONExtractionConfig
88
+
89
+ config = ExtractionConfig(json_config=JSONExtractionConfig(extract_schema=True))
90
+ result = extract_file_sync("data.json", config=config)
91
+
92
+ # Access the schema
93
+ if "json_schema" in result.metadata:
94
+ schema = result.metadata["json_schema"]
95
+ print(f"Root type: {schema['type']}")
96
+ if "properties" in schema:
97
+ print(f"Properties: {list(schema['properties'].keys())}")
98
+ ```
99
+
52
100
  ## Working with Multiple Document Types
53
101
 
54
102
  When working with multiple document types, it's important to remember that different document formats may provide different metadata fields. Always use defensive programming (like using `.get()` with a default value) when accessing metadata fields:
@@ -57,6 +105,9 @@ When working with multiple document types, it's important to remember that diffe
57
105
  # Safe way to access metadata across different document types
58
106
  author = result.metadata.get("authors", ["Unknown"])[0] if "authors" in result.metadata else "Unknown"
59
107
  creation_date = result.metadata.get("created_at", "Unknown date")
108
+
109
+ # For structured data with nested attributes
110
+ nested_fields = result.metadata.get("attributes", {})
60
111
  ```
61
112
 
62
113
  ## Viewing Available Metadata
@@ -1,6 +1,6 @@
1
1
  # Supported Formats
2
2
 
3
- Kreuzberg handles a wide range of document, image, and text formats.
3
+ Kreuzberg handles a wide range of document, image, text, and structured data formats.
4
4
 
5
5
  ## Document Formats
6
6
 
@@ -36,6 +36,19 @@ Kreuzberg handles a wide range of document, image, and text formats.
36
36
  - EndNote and JATS XML (`.xml`)
37
37
  - RIS (`.ris`)
38
38
 
39
+ ## Structured Data Formats
40
+
41
+ - JSON (`.json`) - High-performance extraction using msgspec with schema analysis
42
+ - YAML (`.yaml`, `.yml`) - Full YAML 1.2 support with nested structure extraction
43
+ - TOML (`.toml`) - Configuration and metadata files with type-aware processing
44
+
45
+ These formats benefit from:
46
+
47
+ - **Schema extraction**: Automatically analyze and extract the structure of your data
48
+ - **Custom field detection**: Configure additional text fields for specialized extraction
49
+ - **Type information**: Optionally include data type annotations in extracted content
50
+ - **Performance optimization**: Uses msgspec for efficient JSON parsing
51
+
39
52
  ## Image Formats
40
53
 
41
54
  - JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
@@ -8,8 +8,10 @@ from ._types import (
8
8
  ExtractionConfig,
9
9
  ExtractionResult,
10
10
  GMFTConfig,
11
+ HTMLToMarkdownConfig,
11
12
  ImageOCRConfig,
12
13
  ImageOCRResult,
14
+ JSONExtractionConfig,
13
15
  LanguageDetectionConfig,
14
16
  Metadata,
15
17
  PaddleOCRConfig,
@@ -40,8 +42,10 @@ __all__ = [
40
42
  "ExtractionResult",
41
43
  "ExtractorRegistry",
42
44
  "GMFTConfig",
45
+ "HTMLToMarkdownConfig",
43
46
  "ImageOCRConfig",
44
47
  "ImageOCRResult",
48
+ "JSONExtractionConfig",
45
49
  "KreuzbergError",
46
50
  "LanguageDetectionConfig",
47
51
  "Metadata",
@@ -13,10 +13,8 @@ from typing_extensions import TypedDict
13
13
 
14
14
  from kreuzberg import (
15
15
  EasyOCRConfig,
16
- ExtractedImage,
17
16
  ExtractionConfig,
18
17
  ExtractionResult,
19
- ImageOCRResult,
20
18
  KreuzbergError,
21
19
  MissingDependencyError,
22
20
  PaddleOCRConfig,
@@ -40,30 +38,6 @@ if TYPE_CHECKING:
40
38
  from litestar.datastructures import UploadFile
41
39
 
42
40
 
43
- class ExtractedImageDict(TypedDict):
44
- """TypedDict for extracted image JSON representation."""
45
-
46
- data: str
47
- format: str
48
- filename: str | None
49
- page_number: int | None
50
- dimensions: tuple[int, int] | None
51
- colorspace: str | None
52
- bits_per_component: int | None
53
- is_mask: bool
54
- description: str | None
55
-
56
-
57
- class ImageOCRResultDict(TypedDict):
58
- """TypedDict for image OCR result JSON representation."""
59
-
60
- image: ExtractedImageDict
61
- ocr_result: Any
62
- confidence_score: float | None
63
- processing_time: float | None
64
- skipped_reason: str | None
65
-
66
-
67
41
  class HealthResponse(TypedDict):
68
42
  """Response model for health check endpoint."""
69
43
 
@@ -384,31 +358,6 @@ def _pil_image_encoder(obj: Any) -> str:
384
358
  return f"data:image/png;base64,{img_str}"
385
359
 
386
360
 
387
- def _extracted_image_encoder(obj: ExtractedImage) -> ExtractedImageDict:
388
- encoded_data = base64.b64encode(obj.data).decode()
389
- return ExtractedImageDict(
390
- data=f"data:image/{obj.format};base64,{encoded_data}",
391
- format=obj.format,
392
- filename=obj.filename,
393
- page_number=obj.page_number,
394
- dimensions=obj.dimensions,
395
- colorspace=obj.colorspace,
396
- bits_per_component=obj.bits_per_component,
397
- is_mask=obj.is_mask,
398
- description=obj.description,
399
- )
400
-
401
-
402
- def _image_ocr_result_encoder(obj: ImageOCRResult) -> ImageOCRResultDict:
403
- return ImageOCRResultDict(
404
- image=_extracted_image_encoder(obj.image),
405
- ocr_result=obj.ocr_result,
406
- confidence_score=obj.confidence_score,
407
- processing_time=obj.processing_time,
408
- skipped_reason=obj.skipped_reason,
409
- )
410
-
411
-
412
361
  openapi_config = OpenAPIConfig(
413
362
  title="Kreuzberg API",
414
363
  version="3.14.0",
@@ -428,8 +377,6 @@ openapi_config = OpenAPIConfig(
428
377
  type_encoders = {
429
378
  pl.DataFrame: _polars_dataframe_encoder,
430
379
  Image.Image: _pil_image_encoder,
431
- ExtractedImage: _extracted_image_encoder,
432
- ImageOCRResult: _image_ocr_result_encoder,
433
380
  }
434
381
 
435
382
  app = Litestar(
@@ -69,7 +69,17 @@ def _build_ocr_config_from_cli(
69
69
  try:
70
70
  match ocr_backend:
71
71
  case "tesseract":
72
- return TesseractConfig(**backend_args)
72
+ # Handle PSM mode conversion from int to enum
73
+ processed_args = backend_args.copy()
74
+ if "psm" in processed_args and isinstance(processed_args["psm"], int):
75
+ try:
76
+ processed_args["psm"] = PSMMode(processed_args["psm"])
77
+ except ValueError as e:
78
+ raise ValidationError(
79
+ f"Invalid PSM mode value: {processed_args['psm']}",
80
+ context={"psm_value": processed_args["psm"], "error": str(e)},
81
+ ) from e
82
+ return TesseractConfig(**processed_args)
73
83
  case "easyocr":
74
84
  return EasyOCRConfig(**backend_args)
75
85
  case "paddleocr":
@@ -132,7 +132,7 @@ def classify_document_from_layout(
132
132
  if not found_words.is_empty():
133
133
  scores[doc_type] += 1.0
134
134
  word_top = found_words[0, "top"]
135
- if word_top < page_height * 0.3:
135
+ if word_top is not None and word_top < page_height * 0.3:
136
136
  scores[doc_type] += 0.5
137
137
 
138
138
  total_score = sum(scores.values())
@@ -27,6 +27,8 @@ except ImportError: # pragma: no cover
27
27
  html2text = None
28
28
 
29
29
  _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
30
+ _UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
31
+ _UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")
30
32
 
31
33
 
32
34
  class EmailExtractor(Extractor):
@@ -86,7 +88,14 @@ class EmailExtractor(Extractor):
86
88
  def _format_email_field(self, field: Any) -> str:
87
89
  match field:
88
90
  case list():
89
- return ", ".join(str(item.get("email", "")) if isinstance(item, dict) else str(item) for item in field)
91
+ emails = []
92
+ for item in field:
93
+ if isinstance(item, dict):
94
+ if email := item.get("email", ""):
95
+ emails.append(str(email))
96
+ else:
97
+ emails.append(str(item))
98
+ return ", ".join(emails)
90
99
  case dict():
91
100
  return str(field.get("email", ""))
92
101
  case _:
@@ -111,12 +120,8 @@ class EmailExtractor(Extractor):
111
120
  cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
112
121
  clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
113
122
  clean_html = unescape(clean_html)
114
- clean_html = (
115
- clean_html.replace("\u201c", '"')
116
- .replace("\u201d", '"')
117
- .replace("\u2019", "'")
118
- .replace("\u2018", "'")
119
- )
123
+ clean_html = _UNICODE_QUOTES_PATTERN.sub('"', clean_html)
124
+ clean_html = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", clean_html)
120
125
  text_parts.append(clean_html)
121
126
 
122
127
  def _extract_email_attachments(
@@ -129,12 +134,12 @@ class EmailExtractor(Extractor):
129
134
  for att in attachments:
130
135
  name_val: str = "unknown"
131
136
  if isinstance(att, dict):
132
- n = att.get("name")
137
+ n = att.get("name") or att.get("filename")
133
138
  if isinstance(n, str) and n:
134
139
  name_val = n
135
140
  names.append(name_val)
136
- metadata["attachments"] = names
137
141
  if names:
142
+ metadata["attachments"] = names
138
143
  text_parts.append("Attachments: " + ", ".join(names))
139
144
 
140
145
  def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
@@ -151,7 +156,8 @@ class EmailExtractor(Extractor):
151
156
  if not isinstance(mime, str) or not mime.startswith("image/"):
152
157
  continue
153
158
 
154
- name = att.get("name") if isinstance(att.get("name"), str) else None
159
+ name = att.get("name") or att.get("filename")
160
+ name = name if isinstance(name, str) else None
155
161
  data = att.get("data") or att.get("content") or att.get("payload")
156
162
  raw: bytes | None = None
157
163
  if isinstance(data, (bytes, bytearray)):