kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,385 @@
1
+ //! Centralized FFI configuration parsing module.
2
+ //!
3
+ //! This module consolidates all configuration parsing logic that was previously
4
+ //! duplicated across all language bindings (Python, TypeScript, Ruby, Java, Go, C#).
5
+ //!
6
+ //! Instead of each binding reimplementing config parsing from JSON, they now
7
+ //! call the FFI functions provided here, ensuring:
8
+ //! - Single source of truth for validation rules
9
+ //! - Consistent behavior across all languages
10
+ //! - Elimination of drift/inconsistencies
11
+ //! - Better performance (no JSON round-trips in language bindings)
12
+
13
+ mod html;
14
+ mod loader;
15
+ mod merge;
16
+ mod parse;
17
+ mod serialize;
18
+
19
+ // Re-export key functions for internal use
20
+ pub use loader::{
21
+ discover_config_as_json, get_embedding_preset, list_embedding_presets, load_config_as_json, load_config_from_file,
22
+ };
23
+ pub use merge::merge_configs;
24
+ pub use parse::parse_extraction_config_from_json;
25
+ pub use serialize::{config_to_json_string, get_field_as_json, json_to_c_string};
26
+
27
+ use crate::ffi_panic_guard;
28
+ use crate::helpers::{clear_last_error, set_last_error, string_to_c_string};
29
+ use kreuzberg::core::config::ExtractionConfig;
30
+ use std::ffi::{CStr, CString};
31
+ use std::os::raw::c_char;
32
+ use std::path::Path;
33
+ use std::ptr;
34
+
35
+ /// Parse an ExtractionConfig from a JSON string.
36
+ ///
37
+ /// This is the primary FFI entry point for all language bindings to parse
38
+ /// configuration from JSON. Replaces the need for each binding to implement
39
+ /// its own JSON parsing logic.
40
+ ///
41
+ /// # Arguments
42
+ ///
43
+ /// * `json_config` - Null-terminated C string containing JSON configuration
44
+ ///
45
+ /// # Returns
46
+ ///
47
+ /// A pointer to an ExtractionConfig struct that MUST be freed with
48
+ /// `kreuzberg_config_free`, or NULL on error (check kreuzberg_last_error).
49
+ ///
50
+ /// # Safety
51
+ ///
52
+ /// - `json_config` must be a valid null-terminated C string
53
+ /// - The returned pointer must be freed with `kreuzberg_config_free`
54
+ /// - Returns NULL if parsing fails (error available via `kreuzberg_last_error`)
55
+ #[unsafe(no_mangle)]
56
+ pub unsafe extern "C" fn kreuzberg_config_from_json(json_config: *const c_char) -> *mut ExtractionConfig {
57
+ if json_config.is_null() {
58
+ set_last_error("Config JSON cannot be NULL".to_string());
59
+ return ptr::null_mut();
60
+ }
61
+
62
+ clear_last_error();
63
+
64
+ let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
65
+ Ok(s) => s,
66
+ Err(e) => {
67
+ set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
68
+ return ptr::null_mut();
69
+ }
70
+ };
71
+
72
+ match parse_extraction_config_from_json(json_str) {
73
+ Ok(config) => Box::into_raw(Box::new(config)),
74
+ Err(e) => {
75
+ set_last_error(e);
76
+ ptr::null_mut()
77
+ }
78
+ }
79
+ }
80
+
81
+ /// Free an ExtractionConfig allocated by kreuzberg_config_from_json or similar.
82
+ ///
83
+ /// # Safety
84
+ ///
85
+ /// - `config` must be a pointer previously returned by a config creation function
86
+ /// - `config` can be NULL (no-op)
87
+ /// - `config` must not be used after this call
88
+ #[unsafe(no_mangle)]
89
+ pub unsafe extern "C" fn kreuzberg_config_free(config: *mut ExtractionConfig) {
90
+ if !config.is_null() {
91
+ let _ = unsafe { Box::from_raw(config) };
92
+ }
93
+ }
94
+
95
+ /// Validate a JSON config string without parsing it.
96
+ ///
97
+ /// # Returns
98
+ ///
99
+ /// - 1 if valid (would parse successfully)
100
+ /// - 0 if invalid (check `kreuzberg_last_error` for details)
101
+ ///
102
+ /// # Safety
103
+ ///
104
+ /// - `json_config` must be a valid null-terminated C string
105
+ #[unsafe(no_mangle)]
106
+ pub unsafe extern "C" fn kreuzberg_config_is_valid(json_config: *const c_char) -> i32 {
107
+ if json_config.is_null() {
108
+ set_last_error("Config JSON cannot be NULL".to_string());
109
+ return 0;
110
+ }
111
+
112
+ clear_last_error();
113
+
114
+ let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
115
+ Ok(s) => s,
116
+ Err(e) => {
117
+ set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
118
+ return 0;
119
+ }
120
+ };
121
+
122
+ match parse_extraction_config_from_json(json_str) {
123
+ Ok(_) => 1,
124
+ Err(e) => {
125
+ set_last_error(e);
126
+ 0
127
+ }
128
+ }
129
+ }
130
+
131
+ /// Serialize an ExtractionConfig to JSON string.
132
+ ///
133
+ /// # Safety
134
+ ///
135
+ /// - `config` must be a valid pointer to an ExtractionConfig
136
+ /// - The returned pointer must be freed with `kreuzberg_free_string`
137
+ #[unsafe(no_mangle)]
138
+ pub unsafe extern "C" fn kreuzberg_config_to_json(config: *const ExtractionConfig) -> *mut c_char {
139
+ if config.is_null() {
140
+ set_last_error("Config cannot be NULL".to_string());
141
+ return ptr::null_mut();
142
+ }
143
+
144
+ clear_last_error();
145
+
146
+ match config_to_json_string(unsafe { &*config }) {
147
+ Some(json) => json_to_c_string(json),
148
+ None => ptr::null_mut(),
149
+ }
150
+ }
151
+
152
+ /// Get a specific field from config as JSON string.
153
+ ///
154
+ /// # Safety
155
+ ///
156
+ /// - `config` must be a valid pointer to an ExtractionConfig
157
+ /// - `field_name` must be a valid null-terminated C string
158
+ #[unsafe(no_mangle)]
159
+ pub unsafe extern "C" fn kreuzberg_config_get_field(
160
+ config: *const ExtractionConfig,
161
+ field_name: *const c_char,
162
+ ) -> *mut c_char {
163
+ if config.is_null() {
164
+ set_last_error("Config cannot be NULL".to_string());
165
+ return ptr::null_mut();
166
+ }
167
+
168
+ if field_name.is_null() {
169
+ set_last_error("Field name cannot be NULL".to_string());
170
+ return ptr::null_mut();
171
+ }
172
+
173
+ clear_last_error();
174
+
175
+ let field_str = match unsafe { CStr::from_ptr(field_name) }.to_str() {
176
+ Ok(s) => s,
177
+ Err(e) => {
178
+ set_last_error(format!("Invalid UTF-8 in field name: {}", e));
179
+ return ptr::null_mut();
180
+ }
181
+ };
182
+
183
+ match get_field_as_json(unsafe { &*config }, field_str) {
184
+ Some(json) => json_to_c_string(json),
185
+ None => ptr::null_mut(),
186
+ }
187
+ }
188
+
189
+ /// Merge two configs (override takes precedence over base).
190
+ ///
191
+ /// # Returns
192
+ ///
193
+ /// - 1 on success
194
+ /// - 0 on error (check `kreuzberg_last_error`)
195
+ ///
196
+ /// # Safety
197
+ ///
198
+ /// - `base` must be a valid mutable pointer to an ExtractionConfig
199
+ /// - `override_config` must be a valid pointer to an ExtractionConfig
200
+ #[unsafe(no_mangle)]
201
+ pub unsafe extern "C" fn kreuzberg_config_merge(
202
+ base: *mut ExtractionConfig,
203
+ override_config: *const ExtractionConfig,
204
+ ) -> i32 {
205
+ if base.is_null() {
206
+ set_last_error("Base config cannot be NULL".to_string());
207
+ return 0;
208
+ }
209
+
210
+ if override_config.is_null() {
211
+ set_last_error("Override config cannot be NULL".to_string());
212
+ return 0;
213
+ }
214
+
215
+ clear_last_error();
216
+
217
+ merge_configs(unsafe { &mut *base }, unsafe { &*override_config });
218
+
219
+ 1
220
+ }
221
+
222
+ /// Load an ExtractionConfig from a file (returns JSON string).
223
+ ///
224
+ /// # Safety
225
+ ///
226
+ /// - `file_path` must be a valid null-terminated C string
227
+ /// - The returned string must be freed with `kreuzberg_free_string`
228
+ #[unsafe(no_mangle)]
229
+ pub unsafe extern "C" fn kreuzberg_load_extraction_config_from_file(file_path: *const c_char) -> *mut c_char {
230
+ ffi_panic_guard!("kreuzberg_load_extraction_config_from_file", {
231
+ clear_last_error();
232
+
233
+ if file_path.is_null() {
234
+ set_last_error("file_path cannot be NULL".to_string());
235
+ return ptr::null_mut();
236
+ }
237
+
238
+ let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
239
+ Ok(s) => s,
240
+ Err(e) => {
241
+ set_last_error(format!("Invalid UTF-8 in file path: {}", e));
242
+ return ptr::null_mut();
243
+ }
244
+ };
245
+
246
+ match load_config_as_json(path_str) {
247
+ Ok(json) => match CString::new(json) {
248
+ Ok(cstr) => cstr.into_raw(),
249
+ Err(e) => {
250
+ set_last_error(format!("Failed to create C string: {}", e));
251
+ ptr::null_mut()
252
+ }
253
+ },
254
+ Err(e) => {
255
+ set_last_error(e);
256
+ ptr::null_mut()
257
+ }
258
+ }
259
+ })
260
+ }
261
+
262
+ /// Load an ExtractionConfig from a file (returns pointer to config struct).
263
+ ///
264
+ /// # Safety
265
+ ///
266
+ /// - `path` must be a valid null-terminated C string
267
+ /// - The returned pointer must be freed with `kreuzberg_config_free`
268
+ #[unsafe(no_mangle)]
269
+ pub unsafe extern "C" fn kreuzberg_config_from_file(path: *const c_char) -> *mut ExtractionConfig {
270
+ ffi_panic_guard!("kreuzberg_config_from_file", {
271
+ clear_last_error();
272
+
273
+ if path.is_null() {
274
+ set_last_error("Config path cannot be NULL".to_string());
275
+ return ptr::null_mut();
276
+ }
277
+
278
+ let path_str = match unsafe { CStr::from_ptr(path) }.to_str() {
279
+ Ok(s) => s,
280
+ Err(e) => {
281
+ set_last_error(format!("Invalid UTF-8 in config path: {}", e));
282
+ return ptr::null_mut();
283
+ }
284
+ };
285
+
286
+ let path_buf = Path::new(path_str);
287
+
288
+ match load_config_from_file(path_buf) {
289
+ Ok(config) => Box::into_raw(Box::new(config)),
290
+ Err(e) => {
291
+ set_last_error(e);
292
+ ptr::null_mut()
293
+ }
294
+ }
295
+ })
296
+ }
297
+
298
+ /// Discover and load an ExtractionConfig by searching parent directories.
299
+ ///
300
+ /// # Safety
301
+ ///
302
+ /// - The returned string must be freed with `kreuzberg_free_string`
303
+ #[unsafe(no_mangle)]
304
+ pub unsafe extern "C" fn kreuzberg_config_discover() -> *mut c_char {
305
+ ffi_panic_guard!("kreuzberg_config_discover", {
306
+ clear_last_error();
307
+
308
+ match discover_config_as_json() {
309
+ Some(json) => match CString::new(json) {
310
+ Ok(cstr) => cstr.into_raw(),
311
+ Err(e) => {
312
+ set_last_error(format!("Failed to serialize config: {}", e));
313
+ ptr::null_mut()
314
+ }
315
+ },
316
+ None => ptr::null_mut(),
317
+ }
318
+ })
319
+ }
320
+
321
+ /// List available embedding preset names.
322
+ ///
323
+ /// # Safety
324
+ ///
325
+ /// - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
326
+ #[unsafe(no_mangle)]
327
+ pub unsafe extern "C" fn kreuzberg_list_embedding_presets() -> *mut c_char {
328
+ ffi_panic_guard!("kreuzberg_list_embedding_presets", {
329
+ clear_last_error();
330
+
331
+ match list_embedding_presets() {
332
+ Ok(json) => match string_to_c_string(json) {
333
+ Ok(ptr) => ptr,
334
+ Err(e) => {
335
+ set_last_error(e);
336
+ ptr::null_mut()
337
+ }
338
+ },
339
+ Err(e) => {
340
+ set_last_error(e);
341
+ ptr::null_mut()
342
+ }
343
+ }
344
+ })
345
+ }
346
+
347
+ /// Get a specific embedding preset by name.
348
+ ///
349
+ /// # Safety
350
+ ///
351
+ /// - `name` must be a valid null-terminated C string
352
+ /// - Returned string is JSON object and must be freed with `kreuzberg_free_string`
353
+ #[unsafe(no_mangle)]
354
+ pub unsafe extern "C" fn kreuzberg_get_embedding_preset(name: *const c_char) -> *mut c_char {
355
+ ffi_panic_guard!("kreuzberg_get_embedding_preset", {
356
+ clear_last_error();
357
+
358
+ if name.is_null() {
359
+ set_last_error("preset name cannot be NULL".to_string());
360
+ return ptr::null_mut();
361
+ }
362
+
363
+ let preset_name = match unsafe { CStr::from_ptr(name) }.to_str() {
364
+ Ok(s) => s,
365
+ Err(e) => {
366
+ set_last_error(format!("Invalid UTF-8 in preset name: {}", e));
367
+ return ptr::null_mut();
368
+ }
369
+ };
370
+
371
+ match get_embedding_preset(preset_name) {
372
+ Ok(json) => match string_to_c_string(json) {
373
+ Ok(ptr) => ptr,
374
+ Err(e) => {
375
+ set_last_error(e);
376
+ ptr::null_mut()
377
+ }
378
+ },
379
+ Err(e) => {
380
+ set_last_error(e);
381
+ ptr::null_mut()
382
+ }
383
+ }
384
+ })
385
+ }
@@ -0,0 +1,91 @@
1
+ //! JSON parsing and validation for ExtractionConfig
2
+ //!
3
+ //! Handles deserialization from JSON strings with comprehensive validation.
4
+
5
+ use kreuzberg::core::config::ExtractionConfig;
6
+
7
+ type FfiResult<T> = std::result::Result<T, String>;
8
+
9
+ /// Parse an ExtractionConfig from a JSON string.
10
+ ///
11
+ /// This is the core parsing logic shared by all FFI functions that deal with
12
+ /// JSON configuration. It handles:
13
+ /// - JSON deserialization
14
+ /// - All validation rules
15
+ /// - Type conversions
16
+ /// - HTML options parsing (delegated to html module)
17
+ ///
18
+ /// The error messages are user-friendly and include guidance on what went wrong.
19
+ pub fn parse_extraction_config_from_json(json_str: &str) -> FfiResult<ExtractionConfig> {
20
+ let json_value: serde_json::Value = serde_json::from_str(json_str).map_err(|e| format!("Invalid JSON: {}", e))?;
21
+
22
+ let mut config: ExtractionConfig =
23
+ serde_json::from_value(json_value.clone()).map_err(|e| format!("Invalid configuration structure: {}", e))?;
24
+
25
+ // Parse HTML options if present (complex nested structure)
26
+ if let Some(html_opts_val) = json_value.get("html_options") {
27
+ config.html_options = Some(super::html::parse_html_options(html_opts_val)?);
28
+ }
29
+
30
+ Ok(config)
31
+ }
32
+
33
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty JSON object is accepted.
    #[test]
    fn test_parse_minimal_config() {
        assert!(parse_extraction_config_from_json("{}").is_ok());
    }

    /// A top-level boolean flag is carried over into the config.
    #[test]
    fn test_parse_config_with_use_cache() {
        let parsed = parse_extraction_config_from_json(r#"{"use_cache": true}"#);
        assert!(parsed.is_ok());
        assert!(parsed.unwrap().use_cache);
    }

    /// A nested `ocr` object populates the optional OCR section.
    #[test]
    fn test_parse_config_with_ocr() {
        let parsed = parse_extraction_config_from_json(r#"{"ocr": {"backend": "tesseract", "language": "eng"}}"#);
        assert!(parsed.is_ok());

        let ocr_section = parsed.unwrap().ocr;
        assert!(ocr_section.is_some());

        let ocr = ocr_section.unwrap();
        assert_eq!(ocr.backend, "tesseract");
        assert_eq!(ocr.language, "eng");
    }

    /// Malformed JSON is rejected.
    #[test]
    fn test_parse_invalid_json() {
        assert!(parse_extraction_config_from_json("{invalid json}").is_err());
    }

    /// A config that combines flags and nested sections parses successfully.
    #[test]
    fn test_parse_complex_config() {
        let document = r#"{
 "use_cache": true,
 "enable_quality_processing": true,
 "force_ocr": false,
 "ocr": {
 "backend": "tesseract",
 "language": "eng"
 },
 "chunking": {
 "max_chars": 1024,
 "max_overlap": 128
 },
 "max_concurrent_extractions": 4
 }"#;
        assert!(parse_extraction_config_from_json(document).is_ok());
    }
}
@@ -0,0 +1,118 @@
1
+ //! Serialization and field extraction helpers
2
+ //!
3
+ //! Utilities for converting ExtractionConfig to JSON and extracting specific fields.
4
+
5
+ use crate::helpers::set_last_error;
6
+ use kreuzberg::core::config::ExtractionConfig;
7
+ use serde::Serialize;
8
+ use std::ffi::CString;
9
+ use std::os::raw::c_char;
10
+ use std::ptr;
11
+
12
+ /// SerializableEmbeddingPreset for FFI serialization.
13
+ #[derive(Serialize)]
14
+ pub struct SerializableEmbeddingPreset<'a> {
15
+ pub name: &'a str,
16
+ pub chunk_size: usize,
17
+ pub overlap: usize,
18
+ pub model_name: String,
19
+ pub dimensions: usize,
20
+ pub description: &'a str,
21
+ }
22
+
23
+ /// Serialize an ExtractionConfig to JSON string.
24
+ ///
25
+ /// # Arguments
26
+ ///
27
+ /// * `config` - Reference to an ExtractionConfig
28
+ ///
29
+ /// # Returns
30
+ ///
31
+ /// JSON string on success, or None on error.
32
+ pub fn config_to_json_string(config: &ExtractionConfig) -> Option<String> {
33
+ serde_json::to_string(config).ok()
34
+ }
35
+
36
+ /// Convert a JSON value to C string pointer
37
+ pub fn json_to_c_string(json: String) -> *mut c_char {
38
+ match CString::new(json) {
39
+ Ok(c_string) => c_string.into_raw(),
40
+ Err(e) => {
41
+ set_last_error(format!("Failed to convert JSON to C string: {}", e));
42
+ ptr::null_mut()
43
+ }
44
+ }
45
+ }
46
+
47
+ /// Extract a specific field from config as JSON string.
48
+ ///
49
+ /// Supports dot notation for nested fields (e.g., "ocr.backend").
50
+ ///
51
+ /// # Arguments
52
+ ///
53
+ /// * `config` - Reference to an ExtractionConfig
54
+ /// * `field_path` - Dot-separated field path
55
+ ///
56
+ /// # Returns
57
+ ///
58
+ /// JSON string representation of the field value, or None if not found.
59
+ pub fn get_field_as_json(config: &ExtractionConfig, field_path: &str) -> Option<String> {
60
+ let json_value = match serde_json::to_value(config) {
61
+ Ok(val) => val,
62
+ Err(e) => {
63
+ set_last_error(format!("Failed to serialize config: {}", e));
64
+ return None;
65
+ }
66
+ };
67
+
68
+ let mut current = &json_value;
69
+ for part in field_path.split('.') {
70
+ if let Some(obj) = current.as_object() {
71
+ match obj.get(part) {
72
+ Some(val) => current = val,
73
+ None => {
74
+ set_last_error(format!("Field '{}' not found in config", field_path));
75
+ return None;
76
+ }
77
+ }
78
+ } else {
79
+ set_last_error(format!("Cannot access nested field '{}' in non-object", part));
80
+ return None;
81
+ }
82
+ }
83
+
84
+ match serde_json::to_string(current) {
85
+ Ok(json) => Some(json),
86
+ Err(e) => {
87
+ set_last_error(format!("Failed to serialize field value: {}", e));
88
+ None
89
+ }
90
+ }
91
+ }
92
+
93
#[cfg(test)]
mod tests {
    use super::*;

    /// Default config with only `use_cache` switched on.
    fn cached_config() -> ExtractionConfig {
        ExtractionConfig {
            use_cache: true,
            ..Default::default()
        }
    }

    /// The serialized config mentions the `use_cache` key.
    #[test]
    fn test_config_to_json_string() {
        let serialized = config_to_json_string(&cached_config());
        assert!(serialized.is_some());
        assert!(serialized.unwrap().contains("use_cache"));
    }

    /// A top-level boolean field is returned as its JSON literal.
    #[test]
    fn test_get_field_as_json() {
        let field_json = get_field_as_json(&cached_config(), "use_cache");
        assert!(field_json.is_some());
        assert_eq!(field_json.unwrap(), "true");
    }
}