kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,401 @@
1
+ //! Per-section validation functions.
2
+ //!
3
+ //! This module contains validation functions for individual configuration sections
4
+ //! and their specific parameters. Each function validates a specific aspect of
5
+ //! the configuration and returns detailed error messages when validation fails.
6
+
7
+ use crate::{KreuzbergError, Result};
8
+
9
+ /// Valid binarization methods for image preprocessing.
10
+ const VALID_BINARIZATION_METHODS: &[&str] = &["otsu", "adaptive", "sauvola"];
11
+
12
+ /// Valid token reduction levels.
13
+ const VALID_TOKEN_REDUCTION_LEVELS: &[&str] = &["off", "light", "moderate", "aggressive", "maximum"];
14
+
15
+ /// Valid OCR backends.
16
+ const VALID_OCR_BACKENDS: &[&str] = &["tesseract", "easyocr", "paddleocr"];
17
+
18
+ /// Common ISO 639-1 (2-letter) and ISO 639-3 (3-letter) language codes (extended list).
19
+ /// Covers most major languages and variants used in document processing.
20
+ const VALID_LANGUAGE_CODES: &[&str] = &[
21
+ "en", "de", "fr", "es", "it", "pt", "nl", "pl", "ru", "zh", "ja", "ko", "bg", "cs", "da", "el", "et", "fi", "hu",
22
+ "lt", "lv", "ro", "sk", "sl", "sv", "uk", "ar", "hi", "th", "tr", "vi", "eng", "deu", "fra", "spa", "ita", "por",
23
+ "nld", "pol", "rus", "zho", "jpn", "kor", "ces", "dan", "ell", "est", "fin", "hun", "lit", "lav", "ron", "slk",
24
+ "slv", "swe", "tur",
25
+ ];
26
+
27
+ /// Valid tesseract PSM (Page Segmentation Mode) values.
28
+ const VALID_TESSERACT_PSM: &[i32] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
29
+
30
+ /// Valid tesseract OEM (OCR Engine Mode) values.
31
+ const VALID_TESSERACT_OEM: &[i32] = &[0, 1, 2, 3];
32
+
33
+ /// Valid output formats for tesseract.
34
+ const VALID_OUTPUT_FORMATS: &[&str] = &["text", "markdown"];
35
+
36
+ /// Validate a binarization method string.
37
+ ///
38
+ /// # Arguments
39
+ ///
40
+ /// * `method` - The binarization method to validate (e.g., "otsu", "adaptive", "sauvola")
41
+ ///
42
+ /// # Returns
43
+ ///
44
+ /// `Ok(())` if the method is valid, or a `KreuzbergError::Validation` with details about valid options.
45
+ ///
46
+ /// # Examples
47
+ ///
48
+ /// ```rust
49
+ /// use kreuzberg::core::config_validation::validate_binarization_method;
50
+ ///
51
+ /// assert!(validate_binarization_method("otsu").is_ok());
52
+ /// assert!(validate_binarization_method("adaptive").is_ok());
53
+ /// assert!(validate_binarization_method("invalid").is_err());
54
+ /// ```
55
+ pub fn validate_binarization_method(method: &str) -> Result<()> {
56
+ let method = method.to_lowercase();
57
+ if VALID_BINARIZATION_METHODS.contains(&method.as_str()) {
58
+ Ok(())
59
+ } else {
60
+ Err(KreuzbergError::Validation {
61
+ message: format!(
62
+ "Invalid binarization method '{}'. Valid options are: {}",
63
+ method,
64
+ VALID_BINARIZATION_METHODS.join(", ")
65
+ ),
66
+ source: None,
67
+ })
68
+ }
69
+ }
70
+
71
+ /// Validate a token reduction level string.
72
+ ///
73
+ /// # Arguments
74
+ ///
75
+ /// * `level` - The token reduction level to validate (e.g., "off", "light", "moderate")
76
+ ///
77
+ /// # Returns
78
+ ///
79
+ /// `Ok(())` if the level is valid, or a `KreuzbergError::Validation` with details about valid options.
80
+ ///
81
+ /// # Examples
82
+ ///
83
+ /// ```rust
84
+ /// use kreuzberg::core::config_validation::validate_token_reduction_level;
85
+ ///
86
+ /// assert!(validate_token_reduction_level("off").is_ok());
87
+ /// assert!(validate_token_reduction_level("moderate").is_ok());
88
+ /// assert!(validate_token_reduction_level("extreme").is_err());
89
+ /// ```
90
+ pub fn validate_token_reduction_level(level: &str) -> Result<()> {
91
+ let level = level.to_lowercase();
92
+ if VALID_TOKEN_REDUCTION_LEVELS.contains(&level.as_str()) {
93
+ Ok(())
94
+ } else {
95
+ Err(KreuzbergError::Validation {
96
+ message: format!(
97
+ "Invalid token reduction level '{}'. Valid options are: {}",
98
+ level,
99
+ VALID_TOKEN_REDUCTION_LEVELS.join(", ")
100
+ ),
101
+ source: None,
102
+ })
103
+ }
104
+ }
105
+
106
+ /// Validate an OCR backend string.
107
+ ///
108
+ /// # Arguments
109
+ ///
110
+ /// * `backend` - The OCR backend to validate (e.g., "tesseract", "easyocr", "paddleocr")
111
+ ///
112
+ /// # Returns
113
+ ///
114
+ /// `Ok(())` if the backend is valid, or a `KreuzbergError::Validation` with details about valid options.
115
+ ///
116
+ /// # Examples
117
+ ///
118
+ /// ```rust
119
+ /// use kreuzberg::core::config_validation::validate_ocr_backend;
120
+ ///
121
+ /// assert!(validate_ocr_backend("tesseract").is_ok());
122
+ /// assert!(validate_ocr_backend("easyocr").is_ok());
123
+ /// assert!(validate_ocr_backend("invalid").is_err());
124
+ /// ```
125
+ pub fn validate_ocr_backend(backend: &str) -> Result<()> {
126
+ let backend = backend.to_lowercase();
127
+ if VALID_OCR_BACKENDS.contains(&backend.as_str()) {
128
+ Ok(())
129
+ } else {
130
+ Err(KreuzbergError::Validation {
131
+ message: format!(
132
+ "Invalid OCR backend '{}'. Valid options are: {}",
133
+ backend,
134
+ VALID_OCR_BACKENDS.join(", ")
135
+ ),
136
+ source: None,
137
+ })
138
+ }
139
+ }
140
+
141
+ /// Validate a language code (ISO 639-1 or 639-3 format).
142
+ ///
143
+ /// Accepts both 2-letter ISO 639-1 codes (e.g., "en", "de") and
144
+ /// 3-letter ISO 639-3 codes (e.g., "eng", "deu") for broader compatibility. Matching is case-insensitive, and only codes in the built-in list are accepted.
145
+ ///
146
+ /// # Arguments
147
+ ///
148
+ /// * `code` - The language code to validate
149
+ ///
150
+ /// # Returns
151
+ ///
152
+ /// `Ok(())` if the code is valid, or a `KreuzbergError::Validation` indicating an invalid language code.
153
+ ///
154
+ /// # Examples
155
+ ///
156
+ /// ```rust
157
+ /// use kreuzberg::core::config_validation::validate_language_code;
158
+ ///
159
+ /// assert!(validate_language_code("en").is_ok());
160
+ /// assert!(validate_language_code("eng").is_ok());
161
+ /// assert!(validate_language_code("de").is_ok());
162
+ /// assert!(validate_language_code("deu").is_ok());
163
+ /// assert!(validate_language_code("invalid").is_err());
164
+ /// ```
165
+ pub fn validate_language_code(code: &str) -> Result<()> {
166
+ let code_lower = code.to_lowercase();
167
+
168
+ if VALID_LANGUAGE_CODES.contains(&code_lower.as_str()) {
169
+ return Ok(());
170
+ }
171
+
172
+ Err(KreuzbergError::Validation {
173
+ message: format!(
174
+ "Invalid language code '{}'. Use ISO 639-1 (2-letter, e.g., 'en', 'de') \
175
+ or ISO 639-3 (3-letter, e.g., 'eng', 'deu') codes. \
176
+ Common codes: en, de, fr, es, it, pt, nl, pl, ru, zh, ja, ko, ar, hi, th.",
177
+ code
178
+ ),
179
+ source: None,
180
+ })
181
+ }
182
+
183
+ /// Validate a tesseract Page Segmentation Mode (PSM).
184
+ ///
185
+ /// # Arguments
186
+ ///
187
+ /// * `psm` - The PSM value to validate (0-13)
188
+ ///
189
+ /// # Returns
190
+ ///
191
+ /// `Ok(())` if the PSM is valid, or a `KreuzbergError::Validation` with details about valid ranges.
192
+ ///
193
+ /// # Examples
194
+ ///
195
+ /// ```rust
196
+ /// use kreuzberg::core::config_validation::validate_tesseract_psm;
197
+ ///
198
+ /// assert!(validate_tesseract_psm(3).is_ok()); // Fully automatic
199
+ /// assert!(validate_tesseract_psm(6).is_ok()); // Single block of text
200
+ /// assert!(validate_tesseract_psm(14).is_err()); // Out of range
201
+ /// ```
202
+ pub fn validate_tesseract_psm(psm: i32) -> Result<()> {
203
+ if VALID_TESSERACT_PSM.contains(&psm) {
204
+ Ok(())
205
+ } else {
206
+ Err(KreuzbergError::Validation {
207
+ message: format!(
208
+ "Invalid tesseract PSM value '{}'. Valid range is 0-13. \
209
+ Common values: 3 (auto), 6 (single block), 11 (sparse text).",
210
+ psm
211
+ ),
212
+ source: None,
213
+ })
214
+ }
215
+ }
216
+
217
+ /// Validate a tesseract OCR Engine Mode (OEM).
218
+ ///
219
+ /// # Arguments
220
+ ///
221
+ /// * `oem` - The OEM value to validate (0-3)
222
+ ///
223
+ /// # Returns
224
+ ///
225
+ /// `Ok(())` if the OEM is valid, or a `KreuzbergError::Validation` with details about valid options.
226
+ ///
227
+ /// # Examples
228
+ ///
229
+ /// ```rust
230
+ /// use kreuzberg::core::config_validation::validate_tesseract_oem;
231
+ ///
232
+ /// assert!(validate_tesseract_oem(1).is_ok()); // Neural nets (LSTM)
233
+ /// assert!(validate_tesseract_oem(2).is_ok()); // Legacy + LSTM
234
+ /// assert!(validate_tesseract_oem(4).is_err()); // Out of range
235
+ /// ```
236
+ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
237
+ if VALID_TESSERACT_OEM.contains(&oem) {
238
+ Ok(())
239
+ } else {
240
+ Err(KreuzbergError::Validation {
241
+ message: format!(
242
+ "Invalid tesseract OEM value '{}'. Valid range is 0-3. \
243
+ 0=Legacy, 1=LSTM, 2=Legacy+LSTM, 3=Default",
244
+ oem
245
+ ),
246
+ source: None,
247
+ })
248
+ }
249
+ }
250
+
251
+ /// Validate a tesseract output format.
252
+ ///
253
+ /// # Arguments
254
+ ///
255
+ /// * `format` - The output format to validate (e.g., "text", "markdown")
256
+ ///
257
+ /// # Returns
258
+ ///
259
+ /// `Ok(())` if the format is valid, or a `KreuzbergError::Validation` with details about valid options.
260
+ ///
261
+ /// # Examples
262
+ ///
263
+ /// ```rust
264
+ /// use kreuzberg::core::config_validation::validate_output_format;
265
+ ///
266
+ /// assert!(validate_output_format("text").is_ok());
267
+ /// assert!(validate_output_format("markdown").is_ok());
268
+ /// assert!(validate_output_format("json").is_err());
269
+ /// ```
270
+ pub fn validate_output_format(format: &str) -> Result<()> {
271
+ let format = format.to_lowercase();
272
+ if VALID_OUTPUT_FORMATS.contains(&format.as_str()) {
273
+ Ok(())
274
+ } else {
275
+ Err(KreuzbergError::Validation {
276
+ message: format!(
277
+ "Invalid output format '{}'. Valid options are: {}",
278
+ format,
279
+ VALID_OUTPUT_FORMATS.join(", ")
280
+ ),
281
+ source: None,
282
+ })
283
+ }
284
+ }
285
+
286
+ /// Validate a confidence threshold value.
287
+ ///
288
+ /// Confidence thresholds should be between 0.0 and 1.0 inclusive.
289
+ ///
290
+ /// # Arguments
291
+ ///
292
+ /// * `confidence` - The confidence threshold to validate
293
+ ///
294
+ /// # Returns
295
+ ///
296
+ /// `Ok(())` if the confidence is valid, or a `KreuzbergError::Validation` with details about valid ranges.
297
+ ///
298
+ /// # Examples
299
+ ///
300
+ /// ```rust
301
+ /// use kreuzberg::core::config_validation::validate_confidence;
302
+ ///
303
+ /// assert!(validate_confidence(0.5).is_ok());
304
+ /// assert!(validate_confidence(0.0).is_ok());
305
+ /// assert!(validate_confidence(1.0).is_ok());
306
+ /// assert!(validate_confidence(1.5).is_err());
307
+ /// assert!(validate_confidence(-0.1).is_err());
308
+ /// ```
309
+ pub fn validate_confidence(confidence: f64) -> Result<()> {
310
+ if (0.0..=1.0).contains(&confidence) {
311
+ Ok(())
312
+ } else {
313
+ Err(KreuzbergError::Validation {
314
+ message: format!(
315
+ "Invalid confidence threshold '{}'. Must be between 0.0 and 1.0.",
316
+ confidence
317
+ ),
318
+ source: None,
319
+ })
320
+ }
321
+ }
322
+
323
+ /// Validate a DPI (dots per inch) value.
324
+ ///
325
+ /// DPI must be an integer from 1 to 2400 inclusive; typical values are 72-600.
326
+ ///
327
+ /// # Arguments
328
+ ///
329
+ /// * `dpi` - The DPI value to validate
330
+ ///
331
+ /// # Returns
332
+ ///
333
+ /// `Ok(())` if the DPI is valid, or a `KreuzbergError::Validation` with details about valid ranges.
334
+ ///
335
+ /// # Examples
336
+ ///
337
+ /// ```rust
338
+ /// use kreuzberg::core::config_validation::validate_dpi;
339
+ ///
340
+ /// assert!(validate_dpi(96).is_ok());
341
+ /// assert!(validate_dpi(300).is_ok());
342
+ /// assert!(validate_dpi(0).is_err());
343
+ /// assert!(validate_dpi(-1).is_err());
344
+ /// ```
345
+ pub fn validate_dpi(dpi: i32) -> Result<()> {
346
+ if dpi > 0 && dpi <= 2400 {
347
+ Ok(())
348
+ } else {
349
+ Err(KreuzbergError::Validation {
350
+ message: format!(
351
+ "Invalid DPI value '{}'. Must be a positive integer, typically 72-600.",
352
+ dpi
353
+ ),
354
+ source: None,
355
+ })
356
+ }
357
+ }
358
+
359
+ /// Validate chunk size parameters.
360
+ ///
361
+ /// Checks that max_chars > 0 and max_overlap < max_chars.
362
+ ///
363
+ /// # Arguments
364
+ ///
365
+ /// * `max_chars` - The maximum characters per chunk
366
+ /// * `max_overlap` - The maximum overlap between chunks
367
+ ///
368
+ /// # Returns
369
+ ///
370
+ /// `Ok(())` if the parameters are valid, or a `KreuzbergError::Validation` with details about constraints.
371
+ ///
372
+ /// # Examples
373
+ ///
374
+ /// ```rust
375
+ /// use kreuzberg::core::config_validation::validate_chunking_params;
376
+ ///
377
+ /// assert!(validate_chunking_params(1000, 200).is_ok());
378
+ /// assert!(validate_chunking_params(500, 50).is_ok());
379
+ /// assert!(validate_chunking_params(0, 100).is_err()); // max_chars must be > 0
380
+ /// assert!(validate_chunking_params(100, 150).is_err()); // overlap >= max_chars
381
+ /// ```
382
+ pub fn validate_chunking_params(max_chars: usize, max_overlap: usize) -> Result<()> {
383
+ if max_chars == 0 {
384
+ return Err(KreuzbergError::Validation {
385
+ message: "max_chars must be greater than 0".to_string(),
386
+ source: None,
387
+ });
388
+ }
389
+
390
+ if max_overlap >= max_chars {
391
+ return Err(KreuzbergError::Validation {
392
+ message: format!(
393
+ "max_overlap ({}) must be less than max_chars ({})",
394
+ max_overlap, max_chars
395
+ ),
396
+ source: None,
397
+ });
398
+ }
399
+
400
+ Ok(())
401
+ }
@@ -0,0 +1,246 @@
1
+ //! Batch extraction operations for concurrent processing.
2
+ //!
3
+ //! This module provides parallel extraction capabilities for processing
4
+ //! multiple files or byte arrays concurrently with automatic resource management.
5
+
6
+ use crate::core::config::ExtractionConfig;
7
+ use crate::types::{ErrorMetadata, ExtractionResult, Metadata};
8
+ use crate::{KreuzbergError, Result};
9
+ use std::path::Path;
10
+ use std::sync::Arc;
11
+
12
+ use super::bytes::extract_bytes;
13
+ use super::file::extract_file;
14
+
15
/// Extract content from multiple files concurrently.
///
/// Each path is extracted in its own Tokio task. A semaphore caps how many
/// extractions run at the same time. The cap is taken from
/// `ExtractionConfig::max_concurrent_extractions` and otherwise defaults to
/// `ceil(num_cpus * 1.5)`. The effective cap is clamped to `1..=paths.len()`: a
/// configured value of `0` would create a semaphore without permits and leave
/// every task waiting forever, and more permits than tasks are never used.
///
/// # Arguments
///
/// * `paths` - Vector of file paths to extract
/// * `config` - Extraction configuration (cloned once and shared by all tasks)
///
/// # Returns
///
/// A vector of `ExtractionResult` in the same order as the input paths. An empty
/// input yields an empty vector.
///
/// # Errors
///
/// Extraction errors for individual files (including IO errors) do not fail the
/// batch. The slot of a failed file holds a placeholder result with
/// `metadata.error` populated, `content` set to `"Error: <message>"` and
/// `mime_type` set to `"text/plain"`.
///
/// `KreuzbergError::Other` is returned for the whole batch when a spawned task
/// cannot be joined (for example because it panicked).
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::batch_extract_file;
/// use kreuzberg::core::config::ExtractionConfig;
///
/// # async fn example() -> kreuzberg::Result<()> {
/// let config = ExtractionConfig::default();
/// let paths = vec!["doc1.pdf", "doc2.pdf"];
/// let results = batch_extract_file(paths, &config).await?;
/// println!("Processed {} files", results.len());
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "tokio-runtime")]
#[cfg_attr(feature = "otel", tracing::instrument(
    skip(config, paths),
    fields(
        extraction.batch_size = paths.len(),
    )
))]
pub async fn batch_extract_file(
    paths: Vec<impl AsRef<Path>>,
    config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    use tokio::sync::Semaphore;
    use tokio::task::JoinSet;

    if paths.is_empty() {
        return Ok(vec![]);
    }

    let config = Arc::new(config.clone());

    // `paths` is non-empty here, so the clamp range `1..=paths.len()` is valid.
    // The lower bound prevents a zero-permit semaphore (which would hang the batch);
    // the upper bound drops permits that no task could ever take.
    let max_concurrent = config
        .max_concurrent_extractions
        .unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize)
        .clamp(1, paths.len());
    let semaphore = Arc::new(Semaphore::new(max_concurrent));

    let mut tasks = JoinSet::new();

    for (index, path) in paths.into_iter().enumerate() {
        let path_buf = path.as_ref().to_path_buf();
        let config_clone = Arc::clone(&config);
        let semaphore_clone = Arc::clone(&semaphore);

        tasks.spawn(async move {
            // The semaphore is never closed in this function, so acquiring a permit
            // only waits; the permit is held until the extraction finishes.
            let _permit = semaphore_clone.acquire().await.unwrap();
            let result =
                crate::core::batch_mode::with_batch_mode(async { extract_file(&path_buf, None, &config_clone).await })
                    .await;
            // Carry the input position along so results can be put back in order.
            (index, result)
        });
    }

    // Tasks complete in arbitrary order; each one fills the slot of its input index.
    let mut results: Vec<Option<ExtractionResult>> = vec![None; tasks.len()];

    while let Some(task_result) = tasks.join_next().await {
        match task_result {
            Ok((index, Ok(result))) => {
                results[index] = Some(result);
            }
            Ok((index, Err(e))) => {
                // All errors (including Io) should create error results
                // instead of causing early return that abandons running tasks
                let metadata = Metadata {
                    error: Some(ErrorMetadata {
                        error_type: format!("{:?}", e),
                        message: e.to_string(),
                    }),
                    ..Default::default()
                };

                results[index] = Some(ExtractionResult {
                    content: format!("Error: {}", e),
                    mime_type: "text/plain".to_string(),
                    metadata,
                    tables: vec![],
                    detected_languages: None,
                    chunks: None,
                    images: None,
                    djot_content: None,
                    pages: None,
                    elements: None,
                });
            }
            Err(join_err) => {
                // The index of the failed task is not available here, so the whole
                // batch is reported as failed.
                return Err(KreuzbergError::Other(format!("Task panicked: {}", join_err)));
            }
        }
    }

    // Every successfully joined task filled its slot above. A missing entry would be
    // a bookkeeping bug, which is reported as an error rather than a panic.
    results
        .into_iter()
        .collect::<Option<Vec<_>>>()
        .ok_or_else(|| KreuzbergError::Other("Batch extraction finished with missing results".to_string()))
}
131
+
132
/// Extract content from multiple byte arrays concurrently.
///
/// Each `(bytes, mime_type)` pair is extracted in its own Tokio task. A semaphore
/// caps how many extractions run at the same time. The cap is taken from
/// `ExtractionConfig::max_concurrent_extractions` and otherwise defaults to
/// `ceil(num_cpus * 1.5)`. The effective cap is clamped to `1..=contents.len()`: a
/// configured value of `0` would create a semaphore without permits and leave
/// every task waiting forever, and more permits than tasks are never used.
///
/// # Arguments
///
/// * `contents` - Vector of (bytes, mime_type) tuples
/// * `config` - Extraction configuration (cloned once and shared by all tasks)
///
/// # Returns
///
/// A vector of `ExtractionResult` in the same order as the input. An empty input
/// yields an empty vector.
///
/// # Errors
///
/// Extraction errors for individual items do not fail the batch. The slot of a
/// failed item holds a placeholder result with `metadata.error` populated,
/// `content` set to `"Error: <message>"` and `mime_type` set to `"text/plain"`.
///
/// `KreuzbergError::Other` is returned for the whole batch when a spawned task
/// cannot be joined (for example because it panicked).
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::batch_extract_bytes;
/// use kreuzberg::core::config::ExtractionConfig;
///
/// # async fn example() -> kreuzberg::Result<()> {
/// let config = ExtractionConfig::default();
/// let contents = vec![
///     (b"content 1".to_vec(), "text/plain".to_string()),
///     (b"content 2".to_vec(), "text/plain".to_string()),
/// ];
/// let results = batch_extract_bytes(contents, &config).await?;
/// println!("Processed {} items", results.len());
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "tokio-runtime")]
#[cfg_attr(feature = "otel", tracing::instrument(
    skip(config, contents),
    fields(
        extraction.batch_size = contents.len(),
    )
))]
pub async fn batch_extract_bytes(
    contents: Vec<(Vec<u8>, String)>,
    config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    use tokio::sync::Semaphore;
    use tokio::task::JoinSet;

    if contents.is_empty() {
        return Ok(vec![]);
    }

    let config = Arc::new(config.clone());

    // `contents` is non-empty here, so the clamp range `1..=contents.len()` is valid.
    // The lower bound prevents a zero-permit semaphore (which would hang the batch);
    // the upper bound drops permits that no task could ever take.
    let max_concurrent = config
        .max_concurrent_extractions
        .unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize)
        .clamp(1, contents.len());
    let semaphore = Arc::new(Semaphore::new(max_concurrent));

    let mut tasks = JoinSet::new();

    for (index, (bytes, mime_type)) in contents.into_iter().enumerate() {
        let config_clone = Arc::clone(&config);
        let semaphore_clone = Arc::clone(&semaphore);

        tasks.spawn(async move {
            // The semaphore is never closed in this function, so acquiring a permit
            // only waits; the permit is held until the extraction finishes.
            let _permit = semaphore_clone.acquire().await.unwrap();
            let result = crate::core::batch_mode::with_batch_mode(async {
                extract_bytes(&bytes, &mime_type, &config_clone).await
            })
            .await;
            // Carry the input position along so results can be put back in order.
            (index, result)
        });
    }

    // Tasks complete in arbitrary order; each one fills the slot of its input index.
    let mut results: Vec<Option<ExtractionResult>> = vec![None; tasks.len()];

    while let Some(task_result) = tasks.join_next().await {
        match task_result {
            Ok((index, Ok(result))) => {
                results[index] = Some(result);
            }
            Ok((index, Err(e))) => {
                // All errors (including Io) should create error results
                // instead of causing early return that abandons running tasks
                let metadata = Metadata {
                    error: Some(ErrorMetadata {
                        error_type: format!("{:?}", e),
                        message: e.to_string(),
                    }),
                    ..Default::default()
                };

                results[index] = Some(ExtractionResult {
                    content: format!("Error: {}", e),
                    mime_type: "text/plain".to_string(),
                    metadata,
                    tables: vec![],
                    detected_languages: None,
                    chunks: None,
                    images: None,
                    djot_content: None,
                    pages: None,
                    elements: None,
                });
            }
            Err(join_err) => {
                // The index of the failed task is not available here, so the whole
                // batch is reported as failed.
                return Err(KreuzbergError::Other(format!("Task panicked: {}", join_err)));
            }
        }
    }

    // Every successfully joined task filled its slot above. A missing entry would be
    // a bookkeeping bug, which is reported as an error rather than a panic.
    results
        .into_iter()
        .collect::<Option<Vec<_>>>()
        .ok_or_else(|| KreuzbergError::Other("Batch extraction finished with missing results".to_string()))
}