kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -7,7 +7,7 @@
7
7
  //! All string-returning functions return pointers to C strings that MUST be freed
8
8
  //! with `kreuzberg_free_string()`.
9
9
 
10
- use crate::{clear_last_error, set_last_error};
10
+ use crate::{clear_last_error, ffi_panic_guard, set_last_error};
11
11
  use kreuzberg::types::ExtractionResult;
12
12
  use std::ffi::CString;
13
13
  use std::os::raw::c_char;
@@ -44,20 +44,26 @@ use std::ptr;
44
44
  /// ```
45
45
  #[unsafe(no_mangle)]
46
46
  pub unsafe extern "C" fn kreuzberg_result_get_page_count(result: *const ExtractionResult) -> i32 {
47
- if result.is_null() {
48
- set_last_error("Result cannot be NULL".to_string());
49
- return -1;
50
- }
47
+ ffi_panic_guard!(
48
+ "kreuzberg_result_get_page_count",
49
+ {
50
+ if result.is_null() {
51
+ set_last_error("Result cannot be NULL".to_string());
52
+ return -1;
53
+ }
51
54
 
52
- clear_last_error();
55
+ clear_last_error();
53
56
 
54
- let result_ref = unsafe { &*result };
57
+ let result_ref = unsafe { &*result };
55
58
 
56
- if let Some(metadata) = &result_ref.metadata.pages {
57
- metadata.total_count as i32
58
- } else {
59
- 0
60
- }
59
+ if let Some(metadata) = &result_ref.metadata.pages {
60
+ metadata.total_count as i32
61
+ } else {
62
+ 0
63
+ }
64
+ },
65
+ -1
66
+ )
61
67
  }
62
68
 
63
69
  /// Get chunk count from extraction result.
@@ -92,20 +98,26 @@ pub unsafe extern "C" fn kreuzberg_result_get_page_count(result: *const Extracti
92
98
  /// ```
93
99
  #[unsafe(no_mangle)]
94
100
  pub unsafe extern "C" fn kreuzberg_result_get_chunk_count(result: *const ExtractionResult) -> i32 {
95
- if result.is_null() {
96
- set_last_error("Result cannot be NULL".to_string());
97
- return -1;
98
- }
101
+ ffi_panic_guard!(
102
+ "kreuzberg_result_get_chunk_count",
103
+ {
104
+ if result.is_null() {
105
+ set_last_error("Result cannot be NULL".to_string());
106
+ return -1;
107
+ }
99
108
 
100
- clear_last_error();
109
+ clear_last_error();
101
110
 
102
- let result_ref = unsafe { &*result };
111
+ let result_ref = unsafe { &*result };
103
112
 
104
- if let Some(chunks) = &result_ref.chunks {
105
- chunks.len() as i32
106
- } else {
107
- 0
108
- }
113
+ if let Some(chunks) = &result_ref.chunks {
114
+ chunks.len() as i32
115
+ } else {
116
+ 0
117
+ }
118
+ },
119
+ -1
120
+ )
109
121
  }
110
122
 
111
123
  /// Get detected language from extraction result.
@@ -145,36 +157,38 @@ pub unsafe extern "C" fn kreuzberg_result_get_chunk_count(result: *const Extract
145
157
  /// ```
146
158
  #[unsafe(no_mangle)]
147
159
  pub unsafe extern "C" fn kreuzberg_result_get_detected_language(result: *const ExtractionResult) -> *mut c_char {
148
- if result.is_null() {
149
- set_last_error("Result cannot be NULL".to_string());
150
- return ptr::null_mut();
151
- }
160
+ ffi_panic_guard!("kreuzberg_result_get_detected_language", {
161
+ if result.is_null() {
162
+ set_last_error("Result cannot be NULL".to_string());
163
+ return ptr::null_mut();
164
+ }
152
165
 
153
- clear_last_error();
166
+ clear_last_error();
154
167
 
155
- let result_ref = unsafe { &*result };
168
+ let result_ref = unsafe { &*result };
156
169
 
157
- let language = if let Some(lang) = &result_ref.metadata.language {
158
- lang.clone()
159
- } else if let Some(langs) = &result_ref.detected_languages {
160
- if !langs.is_empty() {
161
- langs[0].clone()
170
+ let language = if let Some(lang) = &result_ref.metadata.language {
171
+ lang.clone()
172
+ } else if let Some(langs) = &result_ref.detected_languages {
173
+ if !langs.is_empty() {
174
+ langs[0].clone()
175
+ } else {
176
+ set_last_error("No language detected".to_string());
177
+ return ptr::null_mut();
178
+ }
162
179
  } else {
163
180
  set_last_error("No language detected".to_string());
164
181
  return ptr::null_mut();
182
+ };
183
+
184
+ match CString::new(language) {
185
+ Ok(c_string) => c_string.into_raw(),
186
+ Err(e) => {
187
+ set_last_error(format!("Failed to convert language to C string: {}", e));
188
+ ptr::null_mut()
189
+ }
165
190
  }
166
- } else {
167
- set_last_error("No language detected".to_string());
168
- return ptr::null_mut();
169
- };
170
-
171
- match CString::new(language) {
172
- Ok(c_string) => c_string.into_raw(),
173
- Err(e) => {
174
- set_last_error(format!("Failed to convert language to C string: {}", e));
175
- ptr::null_mut()
176
- }
177
- }
191
+ })
178
192
  }
179
193
 
180
194
  /// Metadata field accessor structure
@@ -246,58 +260,69 @@ pub unsafe extern "C" fn kreuzberg_result_get_metadata_field(
246
260
  result: *const ExtractionResult,
247
261
  field_name: *const c_char,
248
262
  ) -> CMetadataField {
249
- if result.is_null() {
250
- set_last_error("Result cannot be NULL".to_string());
251
- return CMetadataField {
252
- name: field_name,
253
- json_value: ptr::null_mut(),
254
- is_null: 1,
255
- };
256
- }
263
+ ffi_panic_guard!(
264
+ "kreuzberg_result_get_metadata_field",
265
+ {
266
+ if result.is_null() {
267
+ set_last_error("Result cannot be NULL".to_string());
268
+ return CMetadataField {
269
+ name: field_name,
270
+ json_value: ptr::null_mut(),
271
+ is_null: 1,
272
+ };
273
+ }
257
274
 
258
- if field_name.is_null() {
259
- set_last_error("Field name cannot be NULL".to_string());
260
- return CMetadataField {
261
- name: ptr::null(),
262
- json_value: ptr::null_mut(),
263
- is_null: 1,
264
- };
265
- }
275
+ if field_name.is_null() {
276
+ set_last_error("Field name cannot be NULL".to_string());
277
+ return CMetadataField {
278
+ name: ptr::null(),
279
+ json_value: ptr::null_mut(),
280
+ is_null: 1,
281
+ };
282
+ }
266
283
 
267
- clear_last_error();
284
+ clear_last_error();
268
285
 
269
- let field_str = match unsafe { std::ffi::CStr::from_ptr(field_name) }.to_str() {
270
- Ok(s) => s,
271
- Err(e) => {
272
- set_last_error(format!("Invalid UTF-8 in field name: {}", e));
273
- return CMetadataField {
274
- name: field_name,
275
- json_value: ptr::null_mut(),
276
- is_null: 1,
286
+ let field_str = match unsafe { std::ffi::CStr::from_ptr(field_name) }.to_str() {
287
+ Ok(s) => s,
288
+ Err(e) => {
289
+ set_last_error(format!("Invalid UTF-8 in field name: {}", e));
290
+ return CMetadataField {
291
+ name: field_name,
292
+ json_value: ptr::null_mut(),
293
+ is_null: 1,
294
+ };
295
+ }
277
296
  };
278
- }
279
- };
280
-
281
- let result_ref = unsafe { &*result };
282
-
283
- let metadata_json = match serde_json::to_value(&result_ref.metadata) {
284
- Ok(val) => val,
285
- Err(e) => {
286
- set_last_error(format!("Failed to serialize metadata: {}", e));
287
- return CMetadataField {
288
- name: field_name,
289
- json_value: ptr::null_mut(),
290
- is_null: 1,
297
+
298
+ let result_ref = unsafe { &*result };
299
+
300
+ let metadata_json = match serde_json::to_value(&result_ref.metadata) {
301
+ Ok(val) => val,
302
+ Err(e) => {
303
+ set_last_error(format!("Failed to serialize metadata: {}", e));
304
+ return CMetadataField {
305
+ name: field_name,
306
+ json_value: ptr::null_mut(),
307
+ is_null: 1,
308
+ };
309
+ }
291
310
  };
292
- }
293
- };
294
-
295
- let mut current = &metadata_json;
296
- for part in field_str.split('.') {
297
- if let Some(obj) = current.as_object() {
298
- match obj.get(part) {
299
- Some(val) => current = val,
300
- None => {
311
+
312
+ let mut current = &metadata_json;
313
+ for part in field_str.split('.') {
314
+ if let Some(obj) = current.as_object() {
315
+ match obj.get(part) {
316
+ Some(val) => current = val,
317
+ None => {
318
+ return CMetadataField {
319
+ name: field_name,
320
+ json_value: ptr::null_mut(),
321
+ is_null: 1,
322
+ };
323
+ }
324
+ }
325
+ } else {
301
326
  return CMetadataField {
302
327
  name: field_name,
303
328
  json_value: ptr::null_mut(),
@@ -305,40 +330,39 @@ pub unsafe extern "C" fn kreuzberg_result_get_metadata_field(
305
330
  };
306
331
  }
307
332
  }
308
- } else {
309
- return CMetadataField {
310
- name: field_name,
311
- json_value: ptr::null_mut(),
312
- is_null: 1,
313
- };
314
- }
315
- }
316
333
 
317
- match serde_json::to_string(current) {
318
- Ok(json) => match CString::new(json) {
319
- Ok(c_string) => CMetadataField {
320
- name: field_name,
321
- json_value: c_string.into_raw(),
322
- is_null: 0,
323
- },
324
- Err(e) => {
325
- set_last_error(format!("Failed to convert field value to C string: {}", e));
326
- CMetadataField {
327
- name: field_name,
328
- json_value: ptr::null_mut(),
329
- is_null: 1,
334
+ match serde_json::to_string(current) {
335
+ Ok(json) => match CString::new(json) {
336
+ Ok(c_string) => CMetadataField {
337
+ name: field_name,
338
+ json_value: c_string.into_raw(),
339
+ is_null: 0,
340
+ },
341
+ Err(e) => {
342
+ set_last_error(format!("Failed to convert field value to C string: {}", e));
343
+ CMetadataField {
344
+ name: field_name,
345
+ json_value: ptr::null_mut(),
346
+ is_null: 1,
347
+ }
348
+ }
349
+ },
350
+ Err(e) => {
351
+ set_last_error(format!("Failed to serialize field value: {}", e));
352
+ CMetadataField {
353
+ name: field_name,
354
+ json_value: ptr::null_mut(),
355
+ is_null: 1,
356
+ }
330
357
  }
331
358
  }
332
359
  },
333
- Err(e) => {
334
- set_last_error(format!("Failed to serialize field value: {}", e));
335
- CMetadataField {
336
- name: field_name,
337
- json_value: ptr::null_mut(),
338
- is_null: 1,
339
- }
360
+ CMetadataField {
361
+ name: field_name,
362
+ json_value: ptr::null_mut(),
363
+ is_null: 1,
340
364
  }
341
- }
365
+ )
342
366
  }
343
367
 
344
368
  #[cfg(test)]
@@ -399,6 +423,8 @@ mod tests {
399
423
  ]),
400
424
  images: None,
401
425
  pages: None,
426
+ djot_content: None,
427
+ elements: None,
402
428
  }
403
429
  }
404
430
 
@@ -453,6 +453,8 @@ mod tests {
453
453
  ]),
454
454
  images: None,
455
455
  pages: None,
456
+ djot_content: None,
457
+ elements: None,
456
458
  }
457
459
  }
458
460
 
@@ -717,6 +719,8 @@ mod tests {
717
719
  chunks: None,
718
720
  images: None,
719
721
  pages: None,
722
+ djot_content: None,
723
+ elements: None,
720
724
  };
721
725
 
722
726
  let result_ptr = &result as *const ExtractionResult;
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.0.8"
3
+ version = "4.1.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -26,7 +26,7 @@ image = { workspace = true }
26
26
  [build-dependencies]
27
27
  cc = { version = "^1.2.53", optional = true }
28
28
  cmake = { version = "0.1.57", optional = true }
29
- zip = { version = "7.1.0", optional = true }
29
+ zip = { version = "7.2.0", optional = true }
30
30
 
31
31
  # Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
32
32
  [target.'cfg(target_os = "windows")'.build-dependencies]