kreuzberg 4.0.7 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +24 -16
  3. data/README.md +4 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +26 -353
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,318 @@
1
+ //! HTML options parsing from JSON
2
+ //!
3
+ //! Handles the complex nested structure of HTML conversion options.
4
+
5
+ use html_to_markdown_rs::options::{
6
+ CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingPreset,
7
+ WhitespaceMode,
8
+ };
9
+
10
+ type FfiResult<T> = std::result::Result<T, String>;
11
+
12
+ /// Parse enum value from optional JSON value
13
+ fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
14
+ where
15
+ F: Fn(&str) -> FfiResult<T>,
16
+ {
17
+ if let Some(raw) = value {
18
+ let text = raw
19
+ .as_str()
20
+ .ok_or_else(|| "Expected string for enum field".to_string())?;
21
+ return parse_fn(text).map(Some);
22
+ }
23
+ Ok(None)
24
+ }
25
+
26
+ /// Parse HeadingStyle from string
27
+ fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
28
+ match value.to_lowercase().as_str() {
29
+ "atx" => Ok(HeadingStyle::Atx),
30
+ "underlined" => Ok(HeadingStyle::Underlined),
31
+ "atx_closed" => Ok(HeadingStyle::AtxClosed),
32
+ other => Err(format!(
33
+ "Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
34
+ other
35
+ )),
36
+ }
37
+ }
38
+
39
+ /// Parse ListIndentType from string
40
+ fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
41
+ match value.to_lowercase().as_str() {
42
+ "spaces" => Ok(ListIndentType::Spaces),
43
+ "tabs" => Ok(ListIndentType::Tabs),
44
+ other => Err(format!(
45
+ "Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
46
+ other
47
+ )),
48
+ }
49
+ }
50
+
51
+ /// Parse HighlightStyle from string
52
+ fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
53
+ match value.to_lowercase().as_str() {
54
+ "double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
55
+ "html" => Ok(HighlightStyle::Html),
56
+ "bold" => Ok(HighlightStyle::Bold),
57
+ "none" => Ok(HighlightStyle::None),
58
+ other => Err(format!(
59
+ "Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
60
+ other
61
+ )),
62
+ }
63
+ }
64
+
65
+ /// Parse WhitespaceMode from string
66
+ fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
67
+ match value.to_lowercase().as_str() {
68
+ "normalized" => Ok(WhitespaceMode::Normalized),
69
+ "strict" => Ok(WhitespaceMode::Strict),
70
+ other => Err(format!(
71
+ "Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
72
+ other
73
+ )),
74
+ }
75
+ }
76
+
77
+ /// Parse NewlineStyle from string
78
+ fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
79
+ match value.to_lowercase().as_str() {
80
+ "spaces" => Ok(NewlineStyle::Spaces),
81
+ "backslash" => Ok(NewlineStyle::Backslash),
82
+ other => Err(format!(
83
+ "Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
84
+ other
85
+ )),
86
+ }
87
+ }
88
+
89
+ /// Parse CodeBlockStyle from string
90
+ fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
91
+ match value.to_lowercase().as_str() {
92
+ "indented" => Ok(CodeBlockStyle::Indented),
93
+ "backticks" => Ok(CodeBlockStyle::Backticks),
94
+ "tildes" => Ok(CodeBlockStyle::Tildes),
95
+ other => Err(format!(
96
+ "Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
97
+ other
98
+ )),
99
+ }
100
+ }
101
+
102
+ /// Parse PreprocessingPreset from string
103
+ #[allow(dead_code)]
104
+ fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
105
+ match value.to_lowercase().as_str() {
106
+ "minimal" => Ok(PreprocessingPreset::Minimal),
107
+ "standard" => Ok(PreprocessingPreset::Standard),
108
+ "aggressive" => Ok(PreprocessingPreset::Aggressive),
109
+ other => Err(format!(
110
+ "Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
111
+ other
112
+ )),
113
+ }
114
+ }
115
+
116
+ /// Parse HTML conversion options from JSON value
117
+ pub fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
118
+ let mut opts = ConversionOptions::default();
119
+ let obj = value
120
+ .as_object()
121
+ .ok_or_else(|| "html_options must be an object".to_string())?;
122
+
123
+ if let Some(val) = obj.get("heading_style") {
124
+ opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
125
+ }
126
+
127
+ if let Some(val) = obj.get("list_indent_type") {
128
+ opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
129
+ }
130
+
131
+ if let Some(val) = obj.get("list_indent_width") {
132
+ opts.list_indent_width = val
133
+ .as_u64()
134
+ .map(|v| v as usize)
135
+ .ok_or_else(|| "list_indent_width must be an integer".to_string())?;
136
+ }
137
+
138
+ if let Some(val) = obj.get("bullets") {
139
+ opts.bullets = val
140
+ .as_str()
141
+ .map(str::to_string)
142
+ .ok_or_else(|| "bullets must be a string".to_string())?;
143
+ }
144
+
145
+ if let Some(val) = obj.get("strong_em_symbol") {
146
+ let symbol = val
147
+ .as_str()
148
+ .ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
149
+ let mut chars = symbol.chars();
150
+ opts.strong_em_symbol = chars
151
+ .next()
152
+ .ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
153
+ }
154
+
155
+ if let Some(val) = obj.get("escape_asterisks") {
156
+ opts.escape_asterisks = val
157
+ .as_bool()
158
+ .ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
159
+ }
160
+
161
+ if let Some(val) = obj.get("escape_underscores") {
162
+ opts.escape_underscores = val
163
+ .as_bool()
164
+ .ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
165
+ }
166
+
167
+ if let Some(val) = obj.get("escape_misc") {
168
+ opts.escape_misc = val
169
+ .as_bool()
170
+ .ok_or_else(|| "escape_misc must be a boolean".to_string())?;
171
+ }
172
+
173
+ if let Some(val) = obj.get("escape_ascii") {
174
+ opts.escape_ascii = val
175
+ .as_bool()
176
+ .ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
177
+ }
178
+
179
+ if let Some(val) = obj.get("code_language") {
180
+ opts.code_language = val
181
+ .as_str()
182
+ .map(str::to_string)
183
+ .ok_or_else(|| "code_language must be a string".to_string())?;
184
+ }
185
+
186
+ if let Some(val) = obj.get("autolinks") {
187
+ opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
188
+ }
189
+
190
+ if let Some(val) = obj.get("default_title") {
191
+ opts.default_title = val
192
+ .as_bool()
193
+ .ok_or_else(|| "default_title must be a boolean".to_string())?;
194
+ }
195
+
196
+ if let Some(val) = obj.get("br_in_tables") {
197
+ opts.br_in_tables = val
198
+ .as_bool()
199
+ .ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
200
+ }
201
+
202
+ if let Some(val) = obj.get("hocr_spatial_tables") {
203
+ opts.hocr_spatial_tables = val
204
+ .as_bool()
205
+ .ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
206
+ }
207
+
208
+ if let Some(val) = obj.get("highlight_style") {
209
+ opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
210
+ }
211
+
212
+ if let Some(val) = obj.get("extract_metadata") {
213
+ opts.extract_metadata = val
214
+ .as_bool()
215
+ .ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
216
+ }
217
+
218
+ if let Some(val) = obj.get("whitespace_mode") {
219
+ opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
220
+ }
221
+
222
+ if let Some(val) = obj.get("strip_newlines") {
223
+ opts.strip_newlines = val
224
+ .as_bool()
225
+ .ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
226
+ }
227
+
228
+ if let Some(val) = obj.get("wrap") {
229
+ opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
230
+ }
231
+
232
+ if let Some(val) = obj.get("wrap_width") {
233
+ opts.wrap_width = val
234
+ .as_u64()
235
+ .map(|v| v as usize)
236
+ .ok_or_else(|| "wrap_width must be an integer".to_string())?;
237
+ }
238
+
239
+ if let Some(val) = obj.get("convert_as_inline") {
240
+ opts.convert_as_inline = val
241
+ .as_bool()
242
+ .ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
243
+ }
244
+
245
+ if let Some(val) = obj.get("sub_symbol") {
246
+ opts.sub_symbol = val
247
+ .as_str()
248
+ .map(str::to_string)
249
+ .ok_or_else(|| "sub_symbol must be a string".to_string())?;
250
+ }
251
+
252
+ if let Some(val) = obj.get("sup_symbol") {
253
+ opts.sup_symbol = val
254
+ .as_str()
255
+ .map(str::to_string)
256
+ .ok_or_else(|| "sup_symbol must be a string".to_string())?;
257
+ }
258
+
259
+ if let Some(val) = obj.get("newline_style") {
260
+ opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
261
+ }
262
+
263
+ if let Some(val) = obj.get("code_block_style") {
264
+ opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
265
+ }
266
+
267
+ if let Some(val) = obj.get("keep_inline_images_in") {
268
+ opts.keep_inline_images_in = val
269
+ .as_array()
270
+ .ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
271
+ .iter()
272
+ .map(|v| {
273
+ v.as_str()
274
+ .map(str::to_string)
275
+ .ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
276
+ })
277
+ .collect::<FfiResult<Vec<_>>>()?;
278
+ }
279
+
280
+ if let Some(val) = obj.get("encoding") {
281
+ opts.encoding = val
282
+ .as_str()
283
+ .map(str::to_string)
284
+ .ok_or_else(|| "encoding must be a string".to_string())?;
285
+ }
286
+
287
+ if let Some(val) = obj.get("debug") {
288
+ opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
289
+ }
290
+
291
+ if let Some(val) = obj.get("strip_tags") {
292
+ opts.strip_tags = val
293
+ .as_array()
294
+ .ok_or_else(|| "strip_tags must be an array".to_string())?
295
+ .iter()
296
+ .map(|v| {
297
+ v.as_str()
298
+ .map(str::to_string)
299
+ .ok_or_else(|| "strip_tags entries must be strings".to_string())
300
+ })
301
+ .collect::<FfiResult<Vec<_>>>()?;
302
+ }
303
+
304
+ if let Some(val) = obj.get("preserve_tags") {
305
+ opts.preserve_tags = val
306
+ .as_array()
307
+ .ok_or_else(|| "preserve_tags must be an array".to_string())?
308
+ .iter()
309
+ .map(|v| {
310
+ v.as_str()
311
+ .map(str::to_string)
312
+ .ok_or_else(|| "preserve_tags entries must be strings".to_string())
313
+ })
314
+ .collect::<FfiResult<Vec<_>>>()?;
315
+ }
316
+
317
+ Ok(opts)
318
+ }
@@ -0,0 +1,154 @@
1
+ //! Configuration loading from files
2
+ //!
3
+ //! Handles loading ExtractionConfig from TOML/JSON/YAML files and discovery.
4
+
5
+ use crate::helpers::set_last_error;
6
+ use kreuzberg::KreuzbergError;
7
+ use kreuzberg::core::config::ExtractionConfig;
8
+ use std::path::Path;
9
+
10
+ /// Load an ExtractionConfig from a file (returns JSON string).
11
+ ///
12
+ /// # Arguments
13
+ ///
14
+ /// * `file_path` - Path to the configuration file
15
+ ///
16
+ /// # Returns
17
+ ///
18
+ /// JSON string representation of the config, or error message.
19
+ pub fn load_config_as_json(file_path: &str) -> Result<String, String> {
20
+ match ExtractionConfig::from_file(file_path) {
21
+ Ok(config) => match serde_json::to_string(&config) {
22
+ Ok(json) => Ok(json),
23
+ Err(e) => Err(format!("Failed to serialize config to JSON: {}", e)),
24
+ },
25
+ Err(e) => Err(e.to_string()),
26
+ }
27
+ }
28
+
29
+ /// Load an ExtractionConfig from a file (returns config struct).
30
+ ///
31
+ /// # Arguments
32
+ ///
33
+ /// * `path` - Path to the configuration file
34
+ ///
35
+ /// # Returns
36
+ ///
37
+ /// ExtractionConfig on success, or error message.
38
+ pub fn load_config_from_file(path: &Path) -> Result<ExtractionConfig, String> {
39
+ match ExtractionConfig::from_file(path) {
40
+ Ok(config) => Ok(config),
41
+ Err(e) => match &e {
42
+ KreuzbergError::Io(io_err) => Err(format!("IO error loading config: {}", io_err)),
43
+ _ => Err(format!("Failed to load config from file: {}", e)),
44
+ },
45
+ }
46
+ }
47
+
48
+ /// Discover and load an ExtractionConfig (returns JSON string).
49
+ ///
50
+ /// Searches the current directory and all parent directories for:
51
+ /// - `kreuzberg.toml`
52
+ /// - `kreuzberg.json`
53
+ ///
54
+ /// # Returns
55
+ ///
56
+ /// JSON string of the first config file found, or None if not found.
57
+ pub fn discover_config_as_json() -> Option<String> {
58
+ match ExtractionConfig::discover() {
59
+ Ok(Some(config)) => match serde_json::to_string(&config) {
60
+ Ok(json) => Some(json),
61
+ Err(e) => {
62
+ set_last_error(format!("Failed to serialize config: {}", e));
63
+ None
64
+ }
65
+ },
66
+ Ok(None) => None,
67
+ Err(e) => {
68
+ match &e {
69
+ KreuzbergError::Io(io_err) => {
70
+ set_last_error(format!("IO error discovering config: {}", io_err));
71
+ }
72
+ _ => {
73
+ set_last_error(format!("Failed to discover config: {}", e));
74
+ }
75
+ }
76
+ None
77
+ }
78
+ }
79
+ }
80
+
81
+ /// List available embedding preset names.
82
+ ///
83
+ /// # Returns
84
+ ///
85
+ /// JSON array of preset names, or error message.
86
+ pub fn list_embedding_presets() -> Result<String, String> {
87
+ let presets = kreuzberg::embeddings::list_presets();
88
+ match serde_json::to_string(&presets) {
89
+ Ok(json) => Ok(json),
90
+ Err(e) => Err(format!("Failed to serialize presets: {}", e)),
91
+ }
92
+ }
93
+
94
+ /// Get a specific embedding preset by name.
95
+ ///
96
+ /// # Arguments
97
+ ///
98
+ /// * `preset_name` - Name of the preset to retrieve
99
+ ///
100
+ /// # Returns
101
+ ///
102
+ /// JSON representation of the preset, or error message.
103
+ pub fn get_embedding_preset(preset_name: &str) -> Result<String, String> {
104
+ let preset = match kreuzberg::embeddings::get_preset(preset_name) {
105
+ Some(preset) => preset,
106
+ None => {
107
+ return Err(format!("Unknown embedding preset: {}", preset_name));
108
+ }
109
+ };
110
+
111
+ let model_name = format!("{:?}", preset.model);
112
+ let serializable = super::serialize::SerializableEmbeddingPreset {
113
+ name: preset.name,
114
+ chunk_size: preset.chunk_size,
115
+ overlap: preset.overlap,
116
+ model_name,
117
+ dimensions: preset.dimensions,
118
+ description: preset.description,
119
+ };
120
+
121
+ match serde_json::to_string(&serializable) {
122
+ Ok(json) => Ok(json),
123
+ Err(e) => Err(format!("Failed to serialize embedding preset: {}", e)),
124
+ }
125
+ }
126
+
127
#[cfg(test)]
mod tests {
    use super::*;

    /// The preset listing must succeed and be delimited like a JSON array.
    #[test]
    fn test_list_embedding_presets() {
        let listing = list_embedding_presets();
        assert!(listing.is_ok());

        let json = listing.unwrap();
        assert!(json.starts_with('['));
        assert!(json.ends_with(']'));
    }

    /// A name that matches no preset must be reported as an error.
    #[test]
    fn test_get_embedding_preset_unknown() {
        let lookup = get_embedding_preset("nonexistent_preset");
        assert!(lookup.is_err());
    }

    /// The "fast" preset must serialize and expose the expected keys.
    #[test]
    fn test_get_embedding_preset_valid() {
        let lookup = get_embedding_preset("fast");
        assert!(lookup.is_ok());

        let json = lookup.unwrap();
        for expected_key in ["name", "chunk_size"] {
            assert!(json.contains(expected_key));
        }
    }
}
@@ -0,0 +1,104 @@
1
+ //! Configuration merging logic
2
+ //!
3
+ //! Provides functionality to merge two ExtractionConfig instances.
4
+
5
+ use kreuzberg::core::config::ExtractionConfig;
6
+
7
+ /// Merge two configs (override takes precedence over base).
8
+ ///
9
+ /// Performs a shallow merge where fields from `override_config` take
10
+ /// precedence over fields in `base`. The `base` config is modified in-place.
11
+ ///
12
+ /// # Arguments
13
+ ///
14
+ /// * `base` - Mutable reference to the base config (will be modified)
15
+ /// * `override_config` - Reference to the override config (read-only)
16
+ pub fn merge_configs(base: &mut ExtractionConfig, override_config: &ExtractionConfig) {
17
+ base.use_cache = override_config.use_cache;
18
+ base.enable_quality_processing = override_config.enable_quality_processing;
19
+ base.force_ocr = override_config.force_ocr;
20
+ base.max_concurrent_extractions = override_config.max_concurrent_extractions;
21
+
22
+ if override_config.ocr.is_some() {
23
+ base.ocr = override_config.ocr.clone();
24
+ }
25
+
26
+ if override_config.chunking.is_some() {
27
+ base.chunking = override_config.chunking.clone();
28
+ }
29
+
30
+ if override_config.images.is_some() {
31
+ base.images = override_config.images.clone();
32
+ }
33
+
34
+ #[cfg(feature = "pdf")]
35
+ if override_config.pdf_options.is_some() {
36
+ base.pdf_options = override_config.pdf_options.clone();
37
+ }
38
+
39
+ if override_config.token_reduction.is_some() {
40
+ base.token_reduction = override_config.token_reduction.clone();
41
+ }
42
+
43
+ if override_config.language_detection.is_some() {
44
+ base.language_detection = override_config.language_detection.clone();
45
+ }
46
+
47
+ if override_config.pages.is_some() {
48
+ base.pages = override_config.pages.clone();
49
+ }
50
+
51
+ #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
52
+ if override_config.keywords.is_some() {
53
+ base.keywords = override_config.keywords.clone();
54
+ }
55
+
56
+ if override_config.postprocessor.is_some() {
57
+ base.postprocessor = override_config.postprocessor.clone();
58
+ }
59
+
60
+ if override_config.html_options.is_some() {
61
+ base.html_options = override_config.html_options.clone();
62
+ }
63
+ }
64
+
65
#[cfg(test)]
mod tests {
    use super::*;

    /// Merging an override that only sets `force_ocr` must turn it on in the
    /// base. `use_cache` is also overwritten, with the override's default.
    #[test]
    fn test_merge_configs_simple() {
        let mut target = ExtractionConfig {
            use_cache: true,
            force_ocr: false,
            ..Default::default()
        };
        let overrides = ExtractionConfig {
            force_ocr: true,
            ..Default::default()
        };

        merge_configs(&mut target, &overrides);

        assert!(target.use_cache);
        assert!(target.force_ocr);
    }

    /// A plain flag in the override is applied even when the base had it off.
    #[test]
    fn test_merge_configs_override_to_default() {
        let mut target = ExtractionConfig {
            use_cache: false,
            ..Default::default()
        };
        let overrides = ExtractionConfig {
            use_cache: true,
            ..Default::default()
        };

        merge_configs(&mut target, &overrides);

        assert!(target.use_cache, "override to default value should be applied");
    }
}