kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,313 @@
1
+ //! Integration tests for the /chunk API endpoint.
2
+
3
+ #![cfg(feature = "api")]
4
+
5
+ use axum::{
6
+ body::Body,
7
+ http::{Request, StatusCode},
8
+ };
9
+ use serde_json::json;
10
+ use tower::ServiceExt;
11
+
12
+ use kreuzberg::{ExtractionConfig, api::create_router};
13
+
14
+ #[tokio::test]
15
+ async fn test_chunk_basic() {
16
+ let app = create_router(ExtractionConfig::default());
17
+ let response = app
18
+ .oneshot(
19
+ Request::builder()
20
+ .uri("/chunk")
21
+ .method("POST")
22
+ .header("content-type", "application/json")
23
+ .body(Body::from(
24
+ json!({
25
+ "text": "Short text. More text here. Even more content to chunk."
26
+ })
27
+ .to_string(),
28
+ ))
29
+ .unwrap(),
30
+ )
31
+ .await
32
+ .unwrap();
33
+
34
+ assert_eq!(response.status(), StatusCode::OK);
35
+ }
36
+
37
+ #[tokio::test]
38
+ async fn test_chunk_empty_text_returns_400() {
39
+ let app = create_router(ExtractionConfig::default());
40
+ let response = app
41
+ .oneshot(
42
+ Request::builder()
43
+ .uri("/chunk")
44
+ .method("POST")
45
+ .header("content-type", "application/json")
46
+ .body(Body::from(json!({"text": ""}).to_string()))
47
+ .unwrap(),
48
+ )
49
+ .await
50
+ .unwrap();
51
+
52
+ assert_eq!(response.status(), StatusCode::BAD_REQUEST);
53
+ }
54
+
55
+ #[tokio::test]
56
+ async fn test_chunk_markdown_strategy() {
57
+ let app = create_router(ExtractionConfig::default());
58
+ let response = app
59
+ .oneshot(
60
+ Request::builder()
61
+ .uri("/chunk")
62
+ .method("POST")
63
+ .header("content-type", "application/json")
64
+ .body(Body::from(
65
+ json!({
66
+ "text": "# Heading\n\nParagraph text here.",
67
+ "chunker_type": "markdown"
68
+ })
69
+ .to_string(),
70
+ ))
71
+ .unwrap(),
72
+ )
73
+ .await
74
+ .unwrap();
75
+
76
+ assert_eq!(response.status(), StatusCode::OK);
77
+ }
78
+
79
+ #[tokio::test]
80
+ async fn test_chunk_response_structure() {
81
+ use kreuzberg::api::ChunkResponse;
82
+
83
+ let app = create_router(ExtractionConfig::default());
84
+ let response = app
85
+ .oneshot(
86
+ Request::builder()
87
+ .uri("/chunk")
88
+ .method("POST")
89
+ .header("content-type", "application/json")
90
+ .body(Body::from(
91
+ json!({
92
+ "text": "This is a test. Another sentence here. And one more sentence to ensure we get chunks.",
93
+ "config": {
94
+ "max_characters": 50,
95
+ "overlap": 10,
96
+ "trim": true
97
+ },
98
+ "chunker_type": "text"
99
+ })
100
+ .to_string(),
101
+ ))
102
+ .unwrap(),
103
+ )
104
+ .await
105
+ .unwrap();
106
+
107
+ assert_eq!(response.status(), StatusCode::OK);
108
+
109
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
110
+ let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
111
+
112
+ // Verify response structure
113
+ assert!(chunk_response.chunk_count > 0);
114
+ assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
115
+ assert_eq!(chunk_response.chunker_type, "text");
116
+ assert_eq!(chunk_response.config.max_characters, 50);
117
+ assert_eq!(chunk_response.config.overlap, 10);
118
+ assert!(chunk_response.config.trim);
119
+ assert!(chunk_response.input_size_bytes > 0);
120
+
121
+ // Verify chunk metadata
122
+ for (idx, chunk) in chunk_response.chunks.iter().enumerate() {
123
+ assert!(!chunk.content.is_empty());
124
+ assert_eq!(chunk.chunk_index, idx);
125
+ assert_eq!(chunk.total_chunks, chunk_response.chunk_count);
126
+ assert!(chunk.byte_end > chunk.byte_start);
127
+ }
128
+ }
129
+
130
+ #[tokio::test]
131
+ async fn test_chunk_invalid_strategy_returns_400() {
132
+ let app = create_router(ExtractionConfig::default());
133
+ let response = app
134
+ .oneshot(
135
+ Request::builder()
136
+ .uri("/chunk")
137
+ .method("POST")
138
+ .header("content-type", "application/json")
139
+ .body(Body::from(
140
+ json!({
141
+ "text": "Test text",
142
+ "chunker_type": "invalid_type"
143
+ })
144
+ .to_string(),
145
+ ))
146
+ .unwrap(),
147
+ )
148
+ .await
149
+ .unwrap();
150
+
151
+ assert_eq!(response.status(), StatusCode::BAD_REQUEST);
152
+ }
153
+
154
+ #[tokio::test]
155
+ async fn test_chunk_with_defaults() {
156
+ use kreuzberg::api::ChunkResponse;
157
+
158
+ let app = create_router(ExtractionConfig::default());
159
+ let response = app
160
+ .oneshot(
161
+ Request::builder()
162
+ .uri("/chunk")
163
+ .method("POST")
164
+ .header("content-type", "application/json")
165
+ .body(Body::from(
166
+ json!({
167
+ "text": "This is a test sentence. Another sentence here."
168
+ })
169
+ .to_string(),
170
+ ))
171
+ .unwrap(),
172
+ )
173
+ .await
174
+ .unwrap();
175
+
176
+ assert_eq!(response.status(), StatusCode::OK);
177
+
178
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
179
+ let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
180
+
181
+ // Verify defaults are applied
182
+ assert_eq!(chunk_response.config.max_characters, 2000);
183
+ assert_eq!(chunk_response.config.overlap, 100);
184
+ assert!(chunk_response.config.trim);
185
+ assert_eq!(chunk_response.chunker_type, "text");
186
+ }
187
+
188
+ #[tokio::test]
189
+ async fn test_chunk_malformed_json_returns_400() {
190
+ let app = create_router(ExtractionConfig::default());
191
+ let response = app
192
+ .oneshot(
193
+ Request::builder()
194
+ .uri("/chunk")
195
+ .method("POST")
196
+ .header("content-type", "application/json")
197
+ .body(Body::from("{invalid json}"))
198
+ .unwrap(),
199
+ )
200
+ .await
201
+ .unwrap();
202
+
203
+ assert_eq!(response.status(), StatusCode::BAD_REQUEST);
204
+ }
205
+
206
+ #[tokio::test]
207
+ async fn test_chunk_case_insensitive_chunker_type() {
208
+ use kreuzberg::api::ChunkResponse;
209
+
210
+ let app = create_router(ExtractionConfig::default());
211
+ let response = app
212
+ .oneshot(
213
+ Request::builder()
214
+ .uri("/chunk")
215
+ .method("POST")
216
+ .header("content-type", "application/json")
217
+ .body(Body::from(
218
+ json!({
219
+ "text": "# Title\n\nContent here.",
220
+ "chunker_type": "MARKDOWN"
221
+ })
222
+ .to_string(),
223
+ ))
224
+ .unwrap(),
225
+ )
226
+ .await
227
+ .unwrap();
228
+
229
+ assert_eq!(response.status(), StatusCode::OK);
230
+
231
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
232
+ let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
233
+
234
+ // Verify it's normalized to lowercase
235
+ assert_eq!(chunk_response.chunker_type, "markdown");
236
+ }
237
+
238
+ #[tokio::test]
239
+ async fn test_chunk_long_text() {
240
+ use kreuzberg::api::ChunkResponse;
241
+
242
+ let app = create_router(ExtractionConfig::default());
243
+ let long_text = "Lorem ipsum dolor sit amet. ".repeat(200);
244
+
245
+ let response = app
246
+ .oneshot(
247
+ Request::builder()
248
+ .uri("/chunk")
249
+ .method("POST")
250
+ .header("content-type", "application/json")
251
+ .body(Body::from(
252
+ json!({
253
+ "text": long_text,
254
+ "config": {
255
+ "max_characters": 500,
256
+ "overlap": 50
257
+ }
258
+ })
259
+ .to_string(),
260
+ ))
261
+ .unwrap(),
262
+ )
263
+ .await
264
+ .unwrap();
265
+
266
+ assert_eq!(response.status(), StatusCode::OK);
267
+
268
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
269
+ let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
270
+
271
+ // Should have multiple chunks
272
+ assert!(chunk_response.chunk_count > 1);
273
+ assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
274
+ }
275
+
276
+ #[tokio::test]
277
+ async fn test_chunk_custom_config() {
278
+ use kreuzberg::api::ChunkResponse;
279
+
280
+ let app = create_router(ExtractionConfig::default());
281
+ let response = app
282
+ .oneshot(
283
+ Request::builder()
284
+ .uri("/chunk")
285
+ .method("POST")
286
+ .header("content-type", "application/json")
287
+ .body(Body::from(
288
+ json!({
289
+ "text": "Test sentence one. Test sentence two. Test sentence three.",
290
+ "config": {
291
+ "max_characters": 30,
292
+ "overlap": 5,
293
+ "trim": false
294
+ },
295
+ "chunker_type": "text"
296
+ })
297
+ .to_string(),
298
+ ))
299
+ .unwrap(),
300
+ )
301
+ .await
302
+ .unwrap();
303
+
304
+ assert_eq!(response.status(), StatusCode::OK);
305
+
306
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
307
+ let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
308
+
309
+ // Verify custom config was applied
310
+ assert_eq!(chunk_response.config.max_characters, 30);
311
+ assert_eq!(chunk_response.config.overlap, 5);
312
+ assert!(!chunk_response.config.trim);
313
+ }
@@ -84,9 +84,8 @@ async fn test_embed_with_custom_config() {
84
84
  "texts": ["Test embedding with custom config"],
85
85
  "config": {
86
86
  "model": {
87
- "preset": {
88
- "name": "fast"
89
- }
87
+ "type": "preset",
88
+ "name": "fast"
90
89
  },
91
90
  "batch_size": 32
92
91
  }
@@ -295,9 +294,8 @@ async fn test_embed_different_presets() {
295
294
  "texts": ["Test text"],
296
295
  "config": {
297
296
  "model": {
298
- "preset": {
299
- "name": "fast"
300
- }
297
+ "type": "preset",
298
+ "name": "fast"
301
299
  }
302
300
  }
303
301
  });
@@ -327,9 +325,8 @@ async fn test_embed_different_presets() {
327
325
  "texts": ["Test text"],
328
326
  "config": {
329
327
  "model": {
330
- "preset": {
331
- "name": "balanced"
332
- }
328
+ "type": "preset",
329
+ "name": "balanced"
333
330
  }
334
331
  }
335
332
  });
@@ -257,6 +257,7 @@ fn test_ocr_multipage_efficiency() {
257
257
  backend: "tesseract".to_string(),
258
258
  language: "eng".to_string(),
259
259
  tesseract_config: None,
260
+ output_format: None,
260
261
  }),
261
262
  force_ocr: false,
262
263
  use_cache: true,
@@ -193,6 +193,7 @@ async fn test_concurrent_ocr_processing() {
193
193
  backend: "tesseract".to_string(),
194
194
  language: "eng".to_string(),
195
195
  tesseract_config: None,
196
+ output_format: None,
196
197
  }),
197
198
  force_ocr: false,
198
199
  use_cache: true,
@@ -262,6 +263,7 @@ fn test_concurrent_ocr_cache_stress() {
262
263
  backend: "tesseract".to_string(),
263
264
  language: "eng".to_string(),
264
265
  tesseract_config: None,
266
+ output_format: None,
265
267
  }),
266
268
  force_ocr: false,
267
269
  use_cache: true,
@@ -313,7 +315,10 @@ fn test_concurrent_ocr_cache_stress() {
313
315
  /// - Pipeline can process multiple results in parallel
314
316
  /// - Processors don't interfere with each other
315
317
  /// - Registry reads are thread-safe
318
+ ///
319
+ /// Note: This test is flaky due to timing-dependent concurrent operations.
316
320
  #[tokio::test]
321
+ #[ignore = "flaky concurrency test - timing dependent on system load"]
317
322
  async fn test_concurrent_pipeline_processing() {
318
323
  struct ConcurrentTestProcessor;
319
324
 
@@ -378,6 +383,8 @@ async fn test_concurrent_pipeline_processing() {
378
383
  chunks: None,
379
384
  images: None,
380
385
  pages: None,
386
+ elements: None,
387
+ djot_content: None,
381
388
  };
382
389
 
383
390
  run_pipeline(result, &config).await
@@ -459,6 +459,7 @@ async fn test_extraction_with_ocr_config() {
459
459
  tesseract_config: None,
460
460
  backend: "tesseract".to_string(),
461
461
  language: "eng".to_string(),
462
+ output_format: None,
462
463
  }),
463
464
  force_ocr: true,
464
465
  ..Default::default()
@@ -120,3 +120,133 @@ async fn test_docx_minimal_metadata_extraction() {
120
120
 
121
121
  println!("✅ DOCX minimal metadata extraction test passed!");
122
122
  }
123
+
124
+ #[tokio::test]
125
+ async fn test_docx_keywords_extraction() {
126
+ // This test verifies that DOCX keywords metadata is properly parsed
127
+ // from comma-separated strings into Vec<String> in Metadata.keywords
128
+ //
129
+ // Addresses GitHub issue #309: DOCX keyword extraction was returning
130
+ // strings instead of parsed keyword lists, causing FunctionClauseError
131
+ // in the Elixir binding.
132
+
133
+ use std::io::Write;
134
+ use tempfile::NamedTempFile;
135
+ use zip::CompressionMethod;
136
+ use zip::write::{FileOptions, ZipWriter};
137
+
138
+ // Create a minimal DOCX with keywords metadata
139
+ let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
140
+
141
+ {
142
+ let mut zip = ZipWriter::new(&mut temp_file);
143
+ let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
144
+
145
+ // Add [Content_Types].xml
146
+ zip.start_file("[Content_Types].xml", options).unwrap();
147
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
148
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
149
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
150
+ <Default Extension="xml" ContentType="application/xml"/>
151
+ <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
152
+ <Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>
153
+ </Types>"#).unwrap();
154
+
155
+ // Add _rels/.rels
156
+ zip.start_file("_rels/.rels", options).unwrap();
157
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
158
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
159
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
160
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
161
+ </Relationships>"#).unwrap();
162
+
163
+ // Add word/document.xml with simple content
164
+ zip.start_file("word/document.xml", options).unwrap();
165
+ zip.write_all(
166
+ br#"<?xml version="1.0" encoding="UTF-8"?>
167
+ <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
168
+ <w:body>
169
+ <w:p>
170
+ <w:r>
171
+ <w:t>Test document for keyword extraction</w:t>
172
+ </w:r>
173
+ </w:p>
174
+ </w:body>
175
+ </w:document>"#,
176
+ )
177
+ .unwrap();
178
+
179
+ // Add docProps/core.xml with keywords (comma-separated string)
180
+ zip.start_file("docProps/core.xml", options).unwrap();
181
+ zip.write_all(
182
+ br#"<?xml version="1.0" encoding="UTF-8"?>
183
+ <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
184
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
185
+ xmlns:dcterms="http://purl.org/dc/terms/">
186
+ <dc:title>Test Document</dc:title>
187
+ <dc:creator>Test Author</dc:creator>
188
+ <cp:keywords>rust, docx, extraction, metadata, test</cp:keywords>
189
+ <dc:subject>Testing keyword extraction</dc:subject>
190
+ </cp:coreProperties>"#,
191
+ )
192
+ .unwrap();
193
+
194
+ zip.finish().unwrap();
195
+ }
196
+
197
+ // Extract the DOCX file
198
+ let result = extract_file(
199
+ temp_file.path(),
200
+ Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
201
+ &ExtractionConfig::default(),
202
+ )
203
+ .await
204
+ .expect("Should extract DOCX with keywords successfully");
205
+
206
+ // Verify content was extracted
207
+ assert!(!result.content.is_empty(), "Content should not be empty");
208
+ assert!(
209
+ result.content.contains("Test document for keyword extraction"),
210
+ "Content should match document text"
211
+ );
212
+
213
+ // Verify keywords were parsed into Vec<String> in Metadata.keywords
214
+ assert!(
215
+ result.metadata.keywords.is_some(),
216
+ "Keywords should be present in metadata.keywords"
217
+ );
218
+
219
+ let keywords = result.metadata.keywords.as_ref().unwrap();
220
+ assert_eq!(
221
+ keywords.len(),
222
+ 5,
223
+ "Should have 5 keywords parsed from comma-separated string"
224
+ );
225
+
226
+ // Verify individual keywords were trimmed and parsed correctly
227
+ assert_eq!(keywords[0], "rust", "First keyword should be 'rust'");
228
+ assert_eq!(keywords[1], "docx", "Second keyword should be 'docx'");
229
+ assert_eq!(keywords[2], "extraction", "Third keyword should be 'extraction'");
230
+ assert_eq!(keywords[3], "metadata", "Fourth keyword should be 'metadata'");
231
+ assert_eq!(keywords[4], "test", "Fifth keyword should be 'test'");
232
+
233
+ // Verify other metadata was also extracted
234
+ assert_eq!(
235
+ result.metadata.additional.get("created_by").and_then(|v| v.as_str()),
236
+ Some("Test Author"),
237
+ "Should have correct creator"
238
+ );
239
+ assert_eq!(
240
+ result.metadata.additional.get("title").and_then(|v| v.as_str()),
241
+ Some("Test Document"),
242
+ "Should have correct title"
243
+ );
244
+ assert_eq!(
245
+ result.metadata.additional.get("subject").and_then(|v| v.as_str()),
246
+ Some("Testing keyword extraction"),
247
+ "Should have correct subject"
248
+ );
249
+
250
+ println!("✅ DOCX keywords extraction test passed!");
251
+ println!(" Extracted keywords: {:?}", keywords);
252
+ }
@@ -53,20 +53,14 @@ async fn test_native_epub_wasteland_extraction() {
53
53
  result.content.len()
54
54
  );
55
55
 
56
- assert!(
57
- result.metadata.additional.contains_key("title"),
58
- "Should extract title metadata"
59
- );
56
+ assert!(result.metadata.title.is_some(), "Should extract title metadata");
60
57
  assert_eq!(
61
- result.metadata.additional.get("title").and_then(|v| v.as_str()),
58
+ result.metadata.title.as_deref(),
62
59
  Some("The Waste Land"),
63
60
  "Should have correct title"
64
61
  );
65
62
 
66
- assert!(
67
- result.metadata.additional.contains_key("creator"),
68
- "Should extract creator metadata"
69
- );
63
+ assert!(result.metadata.authors.is_some(), "Should extract creator metadata");
70
64
 
71
65
  assert!(
72
66
  result.content.contains("April") || result.content.contains("cruellest"),
@@ -105,10 +99,7 @@ async fn test_native_epub_images_extraction() {
105
99
  result.content.len()
106
100
  );
107
101
 
108
- assert!(
109
- result.metadata.additional.contains_key("title"),
110
- "Should extract title metadata"
111
- );
102
+ assert!(result.metadata.title.is_some(), "Should extract title metadata");
112
103
 
113
104
  println!("✅ Images EPUB extraction test passed ({} bytes)", result.content.len());
114
105
  }
@@ -179,7 +170,7 @@ async fn test_native_epub2_cover_extraction() {
179
170
  );
180
171
 
181
172
  assert_eq!(
182
- result.metadata.additional.get("title").and_then(|v| v.as_str()),
173
+ result.metadata.title.as_deref(),
183
174
  Some("Pandoc EPUB Test"),
184
175
  "Should have correct title"
185
176
  );
@@ -100,6 +100,7 @@ async fn test_ocr_simple_english_image_async() {
100
100
  backend: "tesseract".to_string(),
101
101
  language: "eng".to_string(),
102
102
  tesseract_config: None,
103
+ output_format: None,
103
104
  }),
104
105
  force_ocr: true,
105
106
  ..Default::default()
@@ -142,6 +143,7 @@ async fn test_ocr_image_without_text_async() {
142
143
  backend: "tesseract".to_string(),
143
144
  language: "eng".to_string(),
144
145
  tesseract_config: None,
146
+ output_format: None,
145
147
  }),
146
148
  force_ocr: true,
147
149
  ..Default::default()
@@ -115,6 +115,7 @@ pub fn test_config_with_ocr() -> kreuzberg::core::config::ExtractionConfig {
115
115
  backend: "tesseract".to_string(),
116
116
  language: "eng".to_string(),
117
117
  tesseract_config: None,
118
+ output_format: None,
118
119
  }),
119
120
  force_ocr: false,
120
121
  ..Default::default()