kreuzberg 4.0.7 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +24 -16
  3. data/README.md +4 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +26 -353
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -47,352 +47,11 @@ pub fn string_to_c_string(value: String) -> std::result::Result<*mut c_char, Str
47
47
 
48
48
  /// Parse extraction configuration from JSON string
49
49
  pub fn parse_extraction_config_from_json(config_str: &str) -> FfiResult<ExtractionConfig> {
50
- use html_to_markdown_rs::options::{
51
- CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
52
- PreprocessingPreset, WhitespaceMode,
53
- };
54
-
55
- fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
56
- where
57
- F: Fn(&str) -> std::result::Result<T, String>,
58
- {
59
- if let Some(raw) = value {
60
- let text = raw
61
- .as_str()
62
- .ok_or_else(|| "Expected string for html_options enum field".to_string())?;
63
- return parse_fn(text).map(Some);
64
- }
65
- Ok(None)
66
- }
67
-
68
- fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
69
- match value.to_lowercase().as_str() {
70
- "atx" => Ok(HeadingStyle::Atx),
71
- "underlined" => Ok(HeadingStyle::Underlined),
72
- "atx_closed" => Ok(HeadingStyle::AtxClosed),
73
- other => Err(format!(
74
- "Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
75
- other
76
- )),
77
- }
78
- }
79
-
80
- fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
81
- match value.to_lowercase().as_str() {
82
- "spaces" => Ok(ListIndentType::Spaces),
83
- "tabs" => Ok(ListIndentType::Tabs),
84
- other => Err(format!(
85
- "Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
86
- other
87
- )),
88
- }
89
- }
90
-
91
- fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
92
- match value.to_lowercase().as_str() {
93
- "double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
94
- "html" => Ok(HighlightStyle::Html),
95
- "bold" => Ok(HighlightStyle::Bold),
96
- "none" => Ok(HighlightStyle::None),
97
- other => Err(format!(
98
- "Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
99
- other
100
- )),
101
- }
102
- }
103
-
104
- fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
105
- match value.to_lowercase().as_str() {
106
- "normalized" => Ok(WhitespaceMode::Normalized),
107
- "strict" => Ok(WhitespaceMode::Strict),
108
- other => Err(format!(
109
- "Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
110
- other
111
- )),
112
- }
113
- }
114
-
115
- fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
116
- match value.to_lowercase().as_str() {
117
- "spaces" => Ok(NewlineStyle::Spaces),
118
- "backslash" => Ok(NewlineStyle::Backslash),
119
- other => Err(format!(
120
- "Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
121
- other
122
- )),
123
- }
124
- }
125
-
126
- fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
127
- match value.to_lowercase().as_str() {
128
- "indented" => Ok(CodeBlockStyle::Indented),
129
- "backticks" => Ok(CodeBlockStyle::Backticks),
130
- "tildes" => Ok(CodeBlockStyle::Tildes),
131
- other => Err(format!(
132
- "Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
133
- other
134
- )),
135
- }
136
- }
137
-
138
- fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
139
- match value.to_lowercase().as_str() {
140
- "minimal" => Ok(PreprocessingPreset::Minimal),
141
- "standard" => Ok(PreprocessingPreset::Standard),
142
- "aggressive" => Ok(PreprocessingPreset::Aggressive),
143
- other => Err(format!(
144
- "Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
145
- other
146
- )),
147
- }
148
- }
149
-
150
- fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
151
- let mut opts = ConversionOptions::default();
152
- let obj = value
153
- .as_object()
154
- .ok_or_else(|| "html_options must be an object".to_string())?;
155
-
156
- if let Some(val) = obj.get("heading_style") {
157
- opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
158
- }
159
-
160
- if let Some(val) = obj.get("list_indent_type") {
161
- opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
162
- }
163
-
164
- if let Some(val) = obj.get("list_indent_width") {
165
- opts.list_indent_width = val
166
- .as_u64()
167
- .map(|v| v as usize)
168
- .ok_or_else(|| "list_indent_width must be an integer".to_string())?;
169
- }
170
-
171
- if let Some(val) = obj.get("bullets") {
172
- opts.bullets = val
173
- .as_str()
174
- .map(str::to_string)
175
- .ok_or_else(|| "bullets must be a string".to_string())?;
176
- }
177
-
178
- if let Some(val) = obj.get("strong_em_symbol") {
179
- let symbol = val
180
- .as_str()
181
- .ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
182
- let mut chars = symbol.chars();
183
- opts.strong_em_symbol = chars
184
- .next()
185
- .ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
186
- }
187
-
188
- if let Some(val) = obj.get("escape_asterisks") {
189
- opts.escape_asterisks = val
190
- .as_bool()
191
- .ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
192
- }
193
- if let Some(val) = obj.get("escape_underscores") {
194
- opts.escape_underscores = val
195
- .as_bool()
196
- .ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
197
- }
198
- if let Some(val) = obj.get("escape_misc") {
199
- opts.escape_misc = val
200
- .as_bool()
201
- .ok_or_else(|| "escape_misc must be a boolean".to_string())?;
202
- }
203
- if let Some(val) = obj.get("escape_ascii") {
204
- opts.escape_ascii = val
205
- .as_bool()
206
- .ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
207
- }
208
-
209
- if let Some(val) = obj.get("code_language") {
210
- opts.code_language = val
211
- .as_str()
212
- .map(str::to_string)
213
- .ok_or_else(|| "code_language must be a string".to_string())?;
214
- }
215
-
216
- if let Some(val) = obj.get("autolinks") {
217
- opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
218
- }
219
-
220
- if let Some(val) = obj.get("default_title") {
221
- opts.default_title = val
222
- .as_bool()
223
- .ok_or_else(|| "default_title must be a boolean".to_string())?;
224
- }
225
-
226
- if let Some(val) = obj.get("br_in_tables") {
227
- opts.br_in_tables = val
228
- .as_bool()
229
- .ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
230
- }
231
-
232
- if let Some(val) = obj.get("hocr_spatial_tables") {
233
- opts.hocr_spatial_tables = val
234
- .as_bool()
235
- .ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
236
- }
237
-
238
- if let Some(val) = obj.get("highlight_style") {
239
- opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
240
- }
241
-
242
- if let Some(val) = obj.get("extract_metadata") {
243
- opts.extract_metadata = val
244
- .as_bool()
245
- .ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
246
- }
247
-
248
- if let Some(val) = obj.get("whitespace_mode") {
249
- opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
250
- }
251
-
252
- if let Some(val) = obj.get("strip_newlines") {
253
- opts.strip_newlines = val
254
- .as_bool()
255
- .ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
256
- }
257
-
258
- if let Some(val) = obj.get("wrap") {
259
- opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
260
- }
261
-
262
- if let Some(val) = obj.get("wrap_width") {
263
- opts.wrap_width = val
264
- .as_u64()
265
- .map(|v| v as usize)
266
- .ok_or_else(|| "wrap_width must be an integer".to_string())?;
267
- }
268
-
269
- if let Some(val) = obj.get("convert_as_inline") {
270
- opts.convert_as_inline = val
271
- .as_bool()
272
- .ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
273
- }
274
-
275
- if let Some(val) = obj.get("sub_symbol") {
276
- opts.sub_symbol = val
277
- .as_str()
278
- .map(str::to_string)
279
- .ok_or_else(|| "sub_symbol must be a string".to_string())?;
280
- }
281
-
282
- if let Some(val) = obj.get("sup_symbol") {
283
- opts.sup_symbol = val
284
- .as_str()
285
- .map(str::to_string)
286
- .ok_or_else(|| "sup_symbol must be a string".to_string())?;
287
- }
288
-
289
- if let Some(val) = obj.get("newline_style") {
290
- opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
291
- }
292
-
293
- if let Some(val) = obj.get("code_block_style") {
294
- opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
295
- }
296
-
297
- if let Some(val) = obj.get("keep_inline_images_in") {
298
- opts.keep_inline_images_in = val
299
- .as_array()
300
- .ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
301
- .iter()
302
- .map(|v| {
303
- v.as_str()
304
- .map(str::to_string)
305
- .ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
306
- })
307
- .collect::<std::result::Result<Vec<_>, _>>()?;
308
- }
309
-
310
- if let Some(val) = obj.get("encoding") {
311
- opts.encoding = val
312
- .as_str()
313
- .map(str::to_string)
314
- .ok_or_else(|| "encoding must be a string".to_string())?;
315
- }
316
-
317
- if let Some(val) = obj.get("debug") {
318
- opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
319
- }
320
-
321
- if let Some(val) = obj.get("strip_tags") {
322
- opts.strip_tags = val
323
- .as_array()
324
- .ok_or_else(|| "strip_tags must be an array".to_string())?
325
- .iter()
326
- .map(|v| {
327
- v.as_str()
328
- .map(str::to_string)
329
- .ok_or_else(|| "strip_tags entries must be strings".to_string())
330
- })
331
- .collect::<std::result::Result<Vec<_>, _>>()?;
332
- }
333
-
334
- if let Some(val) = obj.get("preserve_tags") {
335
- opts.preserve_tags = val
336
- .as_array()
337
- .ok_or_else(|| "preserve_tags must be an array".to_string())?
338
- .iter()
339
- .map(|v| {
340
- v.as_str()
341
- .map(str::to_string)
342
- .ok_or_else(|| "preserve_tags entries must be strings".to_string())
343
- })
344
- .collect::<std::result::Result<Vec<_>, _>>()?;
345
- }
346
-
347
- if let Some(val) = obj.get("preprocessing") {
348
- let pre = val
349
- .as_object()
350
- .ok_or_else(|| "preprocessing must be an object".to_string())?;
351
- let mut preprocessing = opts.preprocessing.clone();
352
-
353
- if let Some(v) = pre.get("enabled") {
354
- preprocessing.enabled = v
355
- .as_bool()
356
- .ok_or_else(|| "preprocessing.enabled must be a boolean".to_string())?;
357
- }
358
-
359
- if let Some(v) = pre.get("preset") {
360
- let preset = v
361
- .as_str()
362
- .ok_or_else(|| "preprocessing.preset must be a string".to_string())?;
363
- preprocessing.preset = parse_preprocessing_preset(preset)?;
364
- }
365
-
366
- if let Some(v) = pre.get("remove_navigation") {
367
- preprocessing.remove_navigation = v
368
- .as_bool()
369
- .ok_or_else(|| "preprocessing.remove_navigation must be a boolean".to_string())?;
370
- }
371
-
372
- if let Some(v) = pre.get("remove_forms") {
373
- preprocessing.remove_forms = v
374
- .as_bool()
375
- .ok_or_else(|| "preprocessing.remove_forms must be a boolean".to_string())?;
376
- }
377
-
378
- opts.preprocessing = preprocessing;
379
- }
380
-
381
- Ok(opts)
382
- }
383
-
384
- let value: serde_json::Value =
50
+ // html-to-markdown-rs v2.22.5+ has #[serde(default)] on ConversionOptions,
51
+ // so serde can now handle partial deserialization with defaults for missing fields
52
+ let config: ExtractionConfig =
385
53
  serde_json::from_str(config_str).map_err(|e| format!("Failed to parse config JSON: {}", e))?;
386
54
 
387
- let html_options = value.get("html_options").map(parse_html_options).transpose()?;
388
-
389
- let mut config: ExtractionConfig =
390
- serde_json::from_value(value).map_err(|e| format!("Failed to parse config JSON: {}", e))?;
391
-
392
- if let Some(options) = html_options {
393
- config.html_options = Some(options);
394
- }
395
-
396
55
  Ok(config)
397
56
  }
398
57
 
@@ -407,6 +66,8 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
407
66
  chunks,
408
67
  images,
409
68
  pages,
69
+ djot_content: _,
70
+ elements: _,
410
71
  } = result;
411
72
 
412
73
  let sanitized_content = if content.contains('\0') {
@@ -596,8 +257,8 @@ mod tests {
596
257
  fn test_parse_extraction_config_with_html_options() {
597
258
  let json = r#"{
598
259
  "html_options": {
599
- "heading_style": "atx",
600
- "escape_asterisks": true,
260
+ "headingStyle": "atx",
261
+ "escapeAsterisks": true,
601
262
  "autolinks": false
602
263
  }
603
264
  }"#;
@@ -620,16 +281,20 @@ mod tests {
620
281
  }
621
282
 
622
283
  #[test]
623
- fn test_parse_extraction_config_invalid_heading_style() {
284
+ fn test_parse_extraction_config_invalid_heading_style_uses_default() {
285
+ // With #[serde(default)], invalid enum values use the default instead of failing
624
286
  let json = r#"{
625
287
  "html_options": {
626
- "heading_style": "invalid_style"
288
+ "headingStyle": "invalid_style"
627
289
  }
628
290
  }"#;
629
291
 
630
292
  let result = parse_extraction_config_from_json(json);
631
- assert!(result.is_err());
632
- assert!(result.unwrap_err().contains("Invalid heading_style"));
293
+ assert!(result.is_ok(), "Parsing should succeed with default values");
294
+
295
+ // Invalid enum values should be ignored and default value used
296
+ let config = result.unwrap();
297
+ assert!(config.html_options.is_some());
633
298
  }
634
299
 
635
300
  #[test]
@@ -641,7 +306,7 @@ mod tests {
641
306
  ];
642
307
 
643
308
  for (input, _expected) in styles {
644
- let json = format!(r#"{{"html_options": {{"heading_style": "{}"}}}}"#, input);
309
+ let json = format!(r#"{{"html_options": {{"headingStyle": "{}"}}}}"#, input);
645
310
  let result = parse_extraction_config_from_json(&json);
646
311
  assert!(result.is_ok(), "Failed to parse heading_style: {}", input);
647
312
  }
@@ -654,8 +319,8 @@ mod tests {
654
319
  "preprocessing": {
655
320
  "enabled": true,
656
321
  "preset": "aggressive",
657
- "remove_navigation": true,
658
- "remove_forms": false
322
+ "removeNavigation": true,
323
+ "removeForms": false
659
324
  }
660
325
  }
661
326
  }"#;
@@ -682,6 +347,8 @@ mod tests {
682
347
  chunks: None,
683
348
  images: None,
684
349
  pages: None,
350
+ djot_content: None,
351
+ elements: None,
685
352
  };
686
353
 
687
354
  let c_result = to_c_extraction_result(result);
@@ -719,6 +386,8 @@ mod tests {
719
386
  chunks: None,
720
387
  images: None,
721
388
  pages: None,
389
+ djot_content: None,
390
+ elements: None,
722
391
  };
723
392
 
724
393
  let c_result = to_c_extraction_result(result);
@@ -766,6 +435,8 @@ mod tests {
766
435
  chunks: None,
767
436
  images: None,
768
437
  pages: None,
438
+ djot_content: None,
439
+ elements: None,
769
440
  };
770
441
 
771
442
  let c_result = to_c_extraction_result(result);
@@ -843,6 +514,8 @@ mod tests {
843
514
  chunks: Some(vec![chunk]),
844
515
  images: None,
845
516
  pages: None,
517
+ djot_content: None,
518
+ elements: None,
846
519
  };
847
520
 
848
521
  let c_result = to_c_extraction_result(result);