kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,385 @@
1
+ //! Centralized FFI configuration parsing module.
2
+ //!
3
+ //! This module consolidates all configuration parsing logic that was previously
4
+ //! duplicated across all language bindings (Python, TypeScript, Ruby, Java, Go, C#).
5
+ //!
6
+ //! Instead of each binding reimplementing config parsing from JSON, they now
7
+ //! call the FFI functions provided here, ensuring:
8
+ //! - Single source of truth for validation rules
9
+ //! - Consistent behavior across all languages
10
+ //! - Elimination of drift/inconsistencies
11
+ //! - Better performance (no JSON round-trips in language bindings)
12
+
13
+ mod html;
14
+ mod loader;
15
+ mod merge;
16
+ mod parse;
17
+ mod serialize;
18
+
19
+ // Re-export key functions for internal use
20
+ pub use loader::{
21
+ discover_config_as_json, get_embedding_preset, list_embedding_presets, load_config_as_json, load_config_from_file,
22
+ };
23
+ pub use merge::merge_configs;
24
+ pub use parse::parse_extraction_config_from_json;
25
+ pub use serialize::{config_to_json_string, get_field_as_json, json_to_c_string};
26
+
27
+ use crate::ffi_panic_guard;
28
+ use crate::helpers::{clear_last_error, set_last_error, string_to_c_string};
29
+ use kreuzberg::core::config::ExtractionConfig;
30
+ use std::ffi::{CStr, CString};
31
+ use std::os::raw::c_char;
32
+ use std::path::Path;
33
+ use std::ptr;
34
+
35
+ /// Parse an ExtractionConfig from a JSON string.
36
+ ///
37
+ /// This is the primary FFI entry point for all language bindings to parse
38
+ /// configuration from JSON. Replaces the need for each binding to implement
39
+ /// its own JSON parsing logic.
40
+ ///
41
+ /// # Arguments
42
+ ///
43
+ /// * `json_config` - Null-terminated C string containing JSON configuration
44
+ ///
45
+ /// # Returns
46
+ ///
47
+ /// A pointer to an ExtractionConfig struct that MUST be freed with
48
+ /// `kreuzberg_config_free`, or NULL on error (check kreuzberg_last_error).
49
+ ///
50
+ /// # Safety
51
+ ///
52
+ /// - `json_config` must be a valid null-terminated C string
53
+ /// - The returned pointer must be freed with `kreuzberg_config_free`
54
+ /// - Returns NULL if parsing fails (error available via `kreuzberg_last_error`)
55
+ #[unsafe(no_mangle)]
56
+ pub unsafe extern "C" fn kreuzberg_config_from_json(json_config: *const c_char) -> *mut ExtractionConfig {
57
+ if json_config.is_null() {
58
+ set_last_error("Config JSON cannot be NULL".to_string());
59
+ return ptr::null_mut();
60
+ }
61
+
62
+ clear_last_error();
63
+
64
+ let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
65
+ Ok(s) => s,
66
+ Err(e) => {
67
+ set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
68
+ return ptr::null_mut();
69
+ }
70
+ };
71
+
72
+ match parse_extraction_config_from_json(json_str) {
73
+ Ok(config) => Box::into_raw(Box::new(config)),
74
+ Err(e) => {
75
+ set_last_error(e);
76
+ ptr::null_mut()
77
+ }
78
+ }
79
+ }
80
+
81
+ /// Free an ExtractionConfig allocated by kreuzberg_config_from_json or similar.
82
+ ///
83
+ /// # Safety
84
+ ///
85
+ /// - `config` must be a pointer previously returned by a config creation function
86
+ /// - `config` can be NULL (no-op)
87
+ /// - `config` must not be used after this call
88
+ #[unsafe(no_mangle)]
89
+ pub unsafe extern "C" fn kreuzberg_config_free(config: *mut ExtractionConfig) {
90
+ if !config.is_null() {
91
+ let _ = unsafe { Box::from_raw(config) };
92
+ }
93
+ }
94
+
95
+ /// Validate a JSON config string without parsing it.
96
+ ///
97
+ /// # Returns
98
+ ///
99
+ /// - 1 if valid (would parse successfully)
100
+ /// - 0 if invalid (check `kreuzberg_last_error` for details)
101
+ ///
102
+ /// # Safety
103
+ ///
104
+ /// - `json_config` must be a valid null-terminated C string
105
+ #[unsafe(no_mangle)]
106
+ pub unsafe extern "C" fn kreuzberg_config_is_valid(json_config: *const c_char) -> i32 {
107
+ if json_config.is_null() {
108
+ set_last_error("Config JSON cannot be NULL".to_string());
109
+ return 0;
110
+ }
111
+
112
+ clear_last_error();
113
+
114
+ let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
115
+ Ok(s) => s,
116
+ Err(e) => {
117
+ set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
118
+ return 0;
119
+ }
120
+ };
121
+
122
+ match parse_extraction_config_from_json(json_str) {
123
+ Ok(_) => 1,
124
+ Err(e) => {
125
+ set_last_error(e);
126
+ 0
127
+ }
128
+ }
129
+ }
130
+
131
+ /// Serialize an ExtractionConfig to JSON string.
132
+ ///
133
+ /// # Safety
134
+ ///
135
+ /// - `config` must be a valid pointer to an ExtractionConfig
136
+ /// - The returned pointer must be freed with `kreuzberg_free_string`
137
+ #[unsafe(no_mangle)]
138
+ pub unsafe extern "C" fn kreuzberg_config_to_json(config: *const ExtractionConfig) -> *mut c_char {
139
+ if config.is_null() {
140
+ set_last_error("Config cannot be NULL".to_string());
141
+ return ptr::null_mut();
142
+ }
143
+
144
+ clear_last_error();
145
+
146
+ match config_to_json_string(unsafe { &*config }) {
147
+ Some(json) => json_to_c_string(json),
148
+ None => ptr::null_mut(),
149
+ }
150
+ }
151
+
152
+ /// Get a specific field from config as JSON string.
153
+ ///
154
+ /// # Safety
155
+ ///
156
+ /// - `config` must be a valid pointer to an ExtractionConfig
157
+ /// - `field_name` must be a valid null-terminated C string
158
+ #[unsafe(no_mangle)]
159
+ pub unsafe extern "C" fn kreuzberg_config_get_field(
160
+ config: *const ExtractionConfig,
161
+ field_name: *const c_char,
162
+ ) -> *mut c_char {
163
+ if config.is_null() {
164
+ set_last_error("Config cannot be NULL".to_string());
165
+ return ptr::null_mut();
166
+ }
167
+
168
+ if field_name.is_null() {
169
+ set_last_error("Field name cannot be NULL".to_string());
170
+ return ptr::null_mut();
171
+ }
172
+
173
+ clear_last_error();
174
+
175
+ let field_str = match unsafe { CStr::from_ptr(field_name) }.to_str() {
176
+ Ok(s) => s,
177
+ Err(e) => {
178
+ set_last_error(format!("Invalid UTF-8 in field name: {}", e));
179
+ return ptr::null_mut();
180
+ }
181
+ };
182
+
183
+ match get_field_as_json(unsafe { &*config }, field_str) {
184
+ Some(json) => json_to_c_string(json),
185
+ None => ptr::null_mut(),
186
+ }
187
+ }
188
+
189
+ /// Merge two configs (override takes precedence over base).
190
+ ///
191
+ /// # Returns
192
+ ///
193
+ /// - 1 on success
194
+ /// - 0 on error (check `kreuzberg_last_error`)
195
+ ///
196
+ /// # Safety
197
+ ///
198
+ /// - `base` must be a valid mutable pointer to an ExtractionConfig
199
+ /// - `override_config` must be a valid pointer to an ExtractionConfig
200
+ #[unsafe(no_mangle)]
201
+ pub unsafe extern "C" fn kreuzberg_config_merge(
202
+ base: *mut ExtractionConfig,
203
+ override_config: *const ExtractionConfig,
204
+ ) -> i32 {
205
+ if base.is_null() {
206
+ set_last_error("Base config cannot be NULL".to_string());
207
+ return 0;
208
+ }
209
+
210
+ if override_config.is_null() {
211
+ set_last_error("Override config cannot be NULL".to_string());
212
+ return 0;
213
+ }
214
+
215
+ clear_last_error();
216
+
217
+ merge_configs(unsafe { &mut *base }, unsafe { &*override_config });
218
+
219
+ 1
220
+ }
221
+
222
+ /// Load an ExtractionConfig from a file (returns JSON string).
223
+ ///
224
+ /// # Safety
225
+ ///
226
+ /// - `file_path` must be a valid null-terminated C string
227
+ /// - The returned string must be freed with `kreuzberg_free_string`
228
+ #[unsafe(no_mangle)]
229
+ pub unsafe extern "C" fn kreuzberg_load_extraction_config_from_file(file_path: *const c_char) -> *mut c_char {
230
+ ffi_panic_guard!("kreuzberg_load_extraction_config_from_file", {
231
+ clear_last_error();
232
+
233
+ if file_path.is_null() {
234
+ set_last_error("file_path cannot be NULL".to_string());
235
+ return ptr::null_mut();
236
+ }
237
+
238
+ let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
239
+ Ok(s) => s,
240
+ Err(e) => {
241
+ set_last_error(format!("Invalid UTF-8 in file path: {}", e));
242
+ return ptr::null_mut();
243
+ }
244
+ };
245
+
246
+ match load_config_as_json(path_str) {
247
+ Ok(json) => match CString::new(json) {
248
+ Ok(cstr) => cstr.into_raw(),
249
+ Err(e) => {
250
+ set_last_error(format!("Failed to create C string: {}", e));
251
+ ptr::null_mut()
252
+ }
253
+ },
254
+ Err(e) => {
255
+ set_last_error(e);
256
+ ptr::null_mut()
257
+ }
258
+ }
259
+ })
260
+ }
261
+
262
+ /// Load an ExtractionConfig from a file (returns pointer to config struct).
263
+ ///
264
+ /// # Safety
265
+ ///
266
+ /// - `path` must be a valid null-terminated C string
267
+ /// - The returned pointer must be freed with `kreuzberg_config_free`
268
+ #[unsafe(no_mangle)]
269
+ pub unsafe extern "C" fn kreuzberg_config_from_file(path: *const c_char) -> *mut ExtractionConfig {
270
+ ffi_panic_guard!("kreuzberg_config_from_file", {
271
+ clear_last_error();
272
+
273
+ if path.is_null() {
274
+ set_last_error("Config path cannot be NULL".to_string());
275
+ return ptr::null_mut();
276
+ }
277
+
278
+ let path_str = match unsafe { CStr::from_ptr(path) }.to_str() {
279
+ Ok(s) => s,
280
+ Err(e) => {
281
+ set_last_error(format!("Invalid UTF-8 in config path: {}", e));
282
+ return ptr::null_mut();
283
+ }
284
+ };
285
+
286
+ let path_buf = Path::new(path_str);
287
+
288
+ match load_config_from_file(path_buf) {
289
+ Ok(config) => Box::into_raw(Box::new(config)),
290
+ Err(e) => {
291
+ set_last_error(e);
292
+ ptr::null_mut()
293
+ }
294
+ }
295
+ })
296
+ }
297
+
298
+ /// Discover and load an ExtractionConfig by searching parent directories.
299
+ ///
300
+ /// # Safety
301
+ ///
302
+ /// - The returned string must be freed with `kreuzberg_free_string`
303
+ #[unsafe(no_mangle)]
304
+ pub unsafe extern "C" fn kreuzberg_config_discover() -> *mut c_char {
305
+ ffi_panic_guard!("kreuzberg_config_discover", {
306
+ clear_last_error();
307
+
308
+ match discover_config_as_json() {
309
+ Some(json) => match CString::new(json) {
310
+ Ok(cstr) => cstr.into_raw(),
311
+ Err(e) => {
312
+ set_last_error(format!("Failed to serialize config: {}", e));
313
+ ptr::null_mut()
314
+ }
315
+ },
316
+ None => ptr::null_mut(),
317
+ }
318
+ })
319
+ }
320
+
321
+ /// List available embedding preset names.
322
+ ///
323
+ /// # Safety
324
+ ///
325
+ /// - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
326
+ #[unsafe(no_mangle)]
327
+ pub unsafe extern "C" fn kreuzberg_list_embedding_presets() -> *mut c_char {
328
+ ffi_panic_guard!("kreuzberg_list_embedding_presets", {
329
+ clear_last_error();
330
+
331
+ match list_embedding_presets() {
332
+ Ok(json) => match string_to_c_string(json) {
333
+ Ok(ptr) => ptr,
334
+ Err(e) => {
335
+ set_last_error(e);
336
+ ptr::null_mut()
337
+ }
338
+ },
339
+ Err(e) => {
340
+ set_last_error(e);
341
+ ptr::null_mut()
342
+ }
343
+ }
344
+ })
345
+ }
346
+
347
+ /// Get a specific embedding preset by name.
348
+ ///
349
+ /// # Safety
350
+ ///
351
+ /// - `name` must be a valid null-terminated C string
352
+ /// - Returned string is JSON object and must be freed with `kreuzberg_free_string`
353
+ #[unsafe(no_mangle)]
354
+ pub unsafe extern "C" fn kreuzberg_get_embedding_preset(name: *const c_char) -> *mut c_char {
355
+ ffi_panic_guard!("kreuzberg_get_embedding_preset", {
356
+ clear_last_error();
357
+
358
+ if name.is_null() {
359
+ set_last_error("preset name cannot be NULL".to_string());
360
+ return ptr::null_mut();
361
+ }
362
+
363
+ let preset_name = match unsafe { CStr::from_ptr(name) }.to_str() {
364
+ Ok(s) => s,
365
+ Err(e) => {
366
+ set_last_error(format!("Invalid UTF-8 in preset name: {}", e));
367
+ return ptr::null_mut();
368
+ }
369
+ };
370
+
371
+ match get_embedding_preset(preset_name) {
372
+ Ok(json) => match string_to_c_string(json) {
373
+ Ok(ptr) => ptr,
374
+ Err(e) => {
375
+ set_last_error(e);
376
+ ptr::null_mut()
377
+ }
378
+ },
379
+ Err(e) => {
380
+ set_last_error(e);
381
+ ptr::null_mut()
382
+ }
383
+ }
384
+ })
385
+ }
@@ -0,0 +1,91 @@
1
+ //! JSON parsing and validation for ExtractionConfig
2
+ //!
3
+ //! Handles deserialization from JSON strings with comprehensive validation.
4
+
5
+ use kreuzberg::core::config::ExtractionConfig;
6
+
7
+ type FfiResult<T> = std::result::Result<T, String>;
8
+
9
+ /// Parse an ExtractionConfig from a JSON string.
10
+ ///
11
+ /// This is the core parsing logic shared by all FFI functions that deal with
12
+ /// JSON configuration. It handles:
13
+ /// - JSON deserialization
14
+ /// - All validation rules
15
+ /// - Type conversions
16
+ /// - HTML options parsing (delegated to html module)
17
+ ///
18
+ /// The error messages are user-friendly and include guidance on what went wrong.
19
+ pub fn parse_extraction_config_from_json(json_str: &str) -> FfiResult<ExtractionConfig> {
20
+ let json_value: serde_json::Value = serde_json::from_str(json_str).map_err(|e| format!("Invalid JSON: {}", e))?;
21
+
22
+ let mut config: ExtractionConfig =
23
+ serde_json::from_value(json_value.clone()).map_err(|e| format!("Invalid configuration structure: {}", e))?;
24
+
25
+ // Parse HTML options if present (complex nested structure)
26
+ if let Some(html_opts_val) = json_value.get("html_options") {
27
+ config.html_options = Some(super::html::parse_html_options(html_opts_val)?);
28
+ }
29
+
30
+ Ok(config)
31
+ }
32
+
33
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty JSON object is accepted.
    #[test]
    fn test_parse_minimal_config() {
        assert!(parse_extraction_config_from_json("{}").is_ok());
    }

    /// `use_cache` set in the JSON is reflected on the parsed config.
    #[test]
    fn test_parse_config_with_use_cache() {
        let parsed = parse_extraction_config_from_json(r#"{"use_cache": true}"#);
        assert!(parsed.is_ok());
        assert!(parsed.unwrap().use_cache);
    }

    /// A nested `ocr` object populates the backend and language fields.
    #[test]
    fn test_parse_config_with_ocr() {
        let input = r#"{"ocr": {"backend": "tesseract", "language": "eng"}}"#;
        let parsed = parse_extraction_config_from_json(input);
        assert!(parsed.is_ok());

        let ocr_section = parsed.unwrap().ocr;
        assert!(ocr_section.is_some());

        let ocr_section = ocr_section.unwrap();
        assert_eq!(ocr_section.backend, "tesseract");
        assert_eq!(ocr_section.language, "eng");
    }

    /// Malformed JSON is rejected.
    #[test]
    fn test_parse_invalid_json() {
        assert!(parse_extraction_config_from_json("{invalid json}").is_err());
    }

    /// A config combining several top-level and nested settings is accepted.
    #[test]
    fn test_parse_complex_config() {
        let input = r#"{
            "use_cache": true,
            "enable_quality_processing": true,
            "force_ocr": false,
            "ocr": {
                "backend": "tesseract",
                "language": "eng"
            },
            "chunking": {
                "max_chars": 1024,
                "max_overlap": 128
            },
            "max_concurrent_extractions": 4
        }"#;
        assert!(parse_extraction_config_from_json(input).is_ok());
    }
}
@@ -0,0 +1,118 @@
1
+ //! Serialization and field extraction helpers
2
+ //!
3
+ //! Utilities for converting ExtractionConfig to JSON and extracting specific fields.
4
+
5
+ use crate::helpers::set_last_error;
6
+ use kreuzberg::core::config::ExtractionConfig;
7
+ use serde::Serialize;
8
+ use std::ffi::CString;
9
+ use std::os::raw::c_char;
10
+ use std::ptr;
11
+
12
+ /// SerializableEmbeddingPreset for FFI serialization.
13
+ #[derive(Serialize)]
14
+ pub struct SerializableEmbeddingPreset<'a> {
15
+ pub name: &'a str,
16
+ pub chunk_size: usize,
17
+ pub overlap: usize,
18
+ pub model_name: String,
19
+ pub dimensions: usize,
20
+ pub description: &'a str,
21
+ }
22
+
23
+ /// Serialize an ExtractionConfig to JSON string.
24
+ ///
25
+ /// # Arguments
26
+ ///
27
+ /// * `config` - Reference to an ExtractionConfig
28
+ ///
29
+ /// # Returns
30
+ ///
31
+ /// JSON string on success, or None on error.
32
+ pub fn config_to_json_string(config: &ExtractionConfig) -> Option<String> {
33
+ serde_json::to_string(config).ok()
34
+ }
35
+
36
+ /// Convert a JSON value to C string pointer
37
+ pub fn json_to_c_string(json: String) -> *mut c_char {
38
+ match CString::new(json) {
39
+ Ok(c_string) => c_string.into_raw(),
40
+ Err(e) => {
41
+ set_last_error(format!("Failed to convert JSON to C string: {}", e));
42
+ ptr::null_mut()
43
+ }
44
+ }
45
+ }
46
+
47
+ /// Extract a specific field from config as JSON string.
48
+ ///
49
+ /// Supports dot notation for nested fields (e.g., "ocr.backend").
50
+ ///
51
+ /// # Arguments
52
+ ///
53
+ /// * `config` - Reference to an ExtractionConfig
54
+ /// * `field_path` - Dot-separated field path
55
+ ///
56
+ /// # Returns
57
+ ///
58
+ /// JSON string representation of the field value, or None if not found.
59
+ pub fn get_field_as_json(config: &ExtractionConfig, field_path: &str) -> Option<String> {
60
+ let json_value = match serde_json::to_value(config) {
61
+ Ok(val) => val,
62
+ Err(e) => {
63
+ set_last_error(format!("Failed to serialize config: {}", e));
64
+ return None;
65
+ }
66
+ };
67
+
68
+ let mut current = &json_value;
69
+ for part in field_path.split('.') {
70
+ if let Some(obj) = current.as_object() {
71
+ match obj.get(part) {
72
+ Some(val) => current = val,
73
+ None => {
74
+ set_last_error(format!("Field '{}' not found in config", field_path));
75
+ return None;
76
+ }
77
+ }
78
+ } else {
79
+ set_last_error(format!("Cannot access nested field '{}' in non-object", part));
80
+ return None;
81
+ }
82
+ }
83
+
84
+ match serde_json::to_string(current) {
85
+ Ok(json) => Some(json),
86
+ Err(e) => {
87
+ set_last_error(format!("Failed to serialize field value: {}", e));
88
+ None
89
+ }
90
+ }
91
+ }
92
+
93
#[cfg(test)]
mod tests {
    use super::*;

    /// Default config with only `use_cache` switched on.
    fn cached_config() -> ExtractionConfig {
        ExtractionConfig {
            use_cache: true,
            ..Default::default()
        }
    }

    /// The serialized config mentions the `use_cache` key.
    #[test]
    fn test_config_to_json_string() {
        let serialized = config_to_json_string(&cached_config());
        assert!(serialized.is_some());

        let serialized = serialized.unwrap();
        assert!(serialized.contains("use_cache"));
    }

    /// A top-level boolean field is returned as its JSON literal.
    #[test]
    fn test_get_field_as_json() {
        let field = get_field_as_json(&cached_config(), "use_cache");
        assert!(field.is_some());
        assert_eq!(field.unwrap(), "true");
    }
}