kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -7,7 +7,7 @@
7
7
  //! All string-returning functions return pointers to C strings that MUST be freed
8
8
  //! with `kreuzberg_free_string()`.
9
9
 
10
- use crate::{clear_last_error, set_last_error};
10
+ use crate::{clear_last_error, ffi_panic_guard, set_last_error};
11
11
  use kreuzberg::types::ExtractionResult;
12
12
  use std::ffi::CString;
13
13
  use std::os::raw::c_char;
@@ -44,20 +44,26 @@ use std::ptr;
44
44
  /// ```
45
45
  #[unsafe(no_mangle)]
46
46
  pub unsafe extern "C" fn kreuzberg_result_get_page_count(result: *const ExtractionResult) -> i32 {
47
- if result.is_null() {
48
- set_last_error("Result cannot be NULL".to_string());
49
- return -1;
50
- }
47
+ ffi_panic_guard!(
48
+ "kreuzberg_result_get_page_count",
49
+ {
50
+ if result.is_null() {
51
+ set_last_error("Result cannot be NULL".to_string());
52
+ return -1;
53
+ }
51
54
 
52
- clear_last_error();
55
+ clear_last_error();
53
56
 
54
- let result_ref = unsafe { &*result };
57
+ let result_ref = unsafe { &*result };
55
58
 
56
- if let Some(metadata) = &result_ref.metadata.pages {
57
- metadata.total_count as i32
58
- } else {
59
- 0
60
- }
59
+ if let Some(metadata) = &result_ref.metadata.pages {
60
+ metadata.total_count as i32
61
+ } else {
62
+ 0
63
+ }
64
+ },
65
+ -1
66
+ )
61
67
  }
62
68
 
63
69
  /// Get chunk count from extraction result.
@@ -92,20 +98,26 @@ pub unsafe extern "C" fn kreuzberg_result_get_page_count(result: *const Extracti
92
98
  /// ```
93
99
  #[unsafe(no_mangle)]
94
100
  pub unsafe extern "C" fn kreuzberg_result_get_chunk_count(result: *const ExtractionResult) -> i32 {
95
- if result.is_null() {
96
- set_last_error("Result cannot be NULL".to_string());
97
- return -1;
98
- }
101
+ ffi_panic_guard!(
102
+ "kreuzberg_result_get_chunk_count",
103
+ {
104
+ if result.is_null() {
105
+ set_last_error("Result cannot be NULL".to_string());
106
+ return -1;
107
+ }
99
108
 
100
- clear_last_error();
109
+ clear_last_error();
101
110
 
102
- let result_ref = unsafe { &*result };
111
+ let result_ref = unsafe { &*result };
103
112
 
104
- if let Some(chunks) = &result_ref.chunks {
105
- chunks.len() as i32
106
- } else {
107
- 0
108
- }
113
+ if let Some(chunks) = &result_ref.chunks {
114
+ chunks.len() as i32
115
+ } else {
116
+ 0
117
+ }
118
+ },
119
+ -1
120
+ )
109
121
  }
110
122
 
111
123
  /// Get detected language from extraction result.
@@ -145,36 +157,38 @@ pub unsafe extern "C" fn kreuzberg_result_get_chunk_count(result: *const Extract
145
157
  /// ```
146
158
  #[unsafe(no_mangle)]
147
159
  pub unsafe extern "C" fn kreuzberg_result_get_detected_language(result: *const ExtractionResult) -> *mut c_char {
148
- if result.is_null() {
149
- set_last_error("Result cannot be NULL".to_string());
150
- return ptr::null_mut();
151
- }
160
+ ffi_panic_guard!("kreuzberg_result_get_detected_language", {
161
+ if result.is_null() {
162
+ set_last_error("Result cannot be NULL".to_string());
163
+ return ptr::null_mut();
164
+ }
152
165
 
153
- clear_last_error();
166
+ clear_last_error();
154
167
 
155
- let result_ref = unsafe { &*result };
168
+ let result_ref = unsafe { &*result };
156
169
 
157
- let language = if let Some(lang) = &result_ref.metadata.language {
158
- lang.clone()
159
- } else if let Some(langs) = &result_ref.detected_languages {
160
- if !langs.is_empty() {
161
- langs[0].clone()
170
+ let language = if let Some(lang) = &result_ref.metadata.language {
171
+ lang.clone()
172
+ } else if let Some(langs) = &result_ref.detected_languages {
173
+ if !langs.is_empty() {
174
+ langs[0].clone()
175
+ } else {
176
+ set_last_error("No language detected".to_string());
177
+ return ptr::null_mut();
178
+ }
162
179
  } else {
163
180
  set_last_error("No language detected".to_string());
164
181
  return ptr::null_mut();
182
+ };
183
+
184
+ match CString::new(language) {
185
+ Ok(c_string) => c_string.into_raw(),
186
+ Err(e) => {
187
+ set_last_error(format!("Failed to convert language to C string: {}", e));
188
+ ptr::null_mut()
189
+ }
165
190
  }
166
- } else {
167
- set_last_error("No language detected".to_string());
168
- return ptr::null_mut();
169
- };
170
-
171
- match CString::new(language) {
172
- Ok(c_string) => c_string.into_raw(),
173
- Err(e) => {
174
- set_last_error(format!("Failed to convert language to C string: {}", e));
175
- ptr::null_mut()
176
- }
177
- }
191
+ })
178
192
  }
179
193
 
180
194
  /// Metadata field accessor structure
@@ -246,58 +260,69 @@ pub unsafe extern "C" fn kreuzberg_result_get_metadata_field(
246
260
  result: *const ExtractionResult,
247
261
  field_name: *const c_char,
248
262
  ) -> CMetadataField {
249
- if result.is_null() {
250
- set_last_error("Result cannot be NULL".to_string());
251
- return CMetadataField {
252
- name: field_name,
253
- json_value: ptr::null_mut(),
254
- is_null: 1,
255
- };
256
- }
263
+ ffi_panic_guard!(
264
+ "kreuzberg_result_get_metadata_field",
265
+ {
266
+ if result.is_null() {
267
+ set_last_error("Result cannot be NULL".to_string());
268
+ return CMetadataField {
269
+ name: field_name,
270
+ json_value: ptr::null_mut(),
271
+ is_null: 1,
272
+ };
273
+ }
257
274
 
258
- if field_name.is_null() {
259
- set_last_error("Field name cannot be NULL".to_string());
260
- return CMetadataField {
261
- name: ptr::null(),
262
- json_value: ptr::null_mut(),
263
- is_null: 1,
264
- };
265
- }
275
+ if field_name.is_null() {
276
+ set_last_error("Field name cannot be NULL".to_string());
277
+ return CMetadataField {
278
+ name: ptr::null(),
279
+ json_value: ptr::null_mut(),
280
+ is_null: 1,
281
+ };
282
+ }
266
283
 
267
- clear_last_error();
284
+ clear_last_error();
268
285
 
269
- let field_str = match unsafe { std::ffi::CStr::from_ptr(field_name) }.to_str() {
270
- Ok(s) => s,
271
- Err(e) => {
272
- set_last_error(format!("Invalid UTF-8 in field name: {}", e));
273
- return CMetadataField {
274
- name: field_name,
275
- json_value: ptr::null_mut(),
276
- is_null: 1,
286
+ let field_str = match unsafe { std::ffi::CStr::from_ptr(field_name) }.to_str() {
287
+ Ok(s) => s,
288
+ Err(e) => {
289
+ set_last_error(format!("Invalid UTF-8 in field name: {}", e));
290
+ return CMetadataField {
291
+ name: field_name,
292
+ json_value: ptr::null_mut(),
293
+ is_null: 1,
294
+ };
295
+ }
277
296
  };
278
- }
279
- };
280
-
281
- let result_ref = unsafe { &*result };
282
-
283
- let metadata_json = match serde_json::to_value(&result_ref.metadata) {
284
- Ok(val) => val,
285
- Err(e) => {
286
- set_last_error(format!("Failed to serialize metadata: {}", e));
287
- return CMetadataField {
288
- name: field_name,
289
- json_value: ptr::null_mut(),
290
- is_null: 1,
297
+
298
+ let result_ref = unsafe { &*result };
299
+
300
+ let metadata_json = match serde_json::to_value(&result_ref.metadata) {
301
+ Ok(val) => val,
302
+ Err(e) => {
303
+ set_last_error(format!("Failed to serialize metadata: {}", e));
304
+ return CMetadataField {
305
+ name: field_name,
306
+ json_value: ptr::null_mut(),
307
+ is_null: 1,
308
+ };
309
+ }
291
310
  };
292
- }
293
- };
294
-
295
- let mut current = &metadata_json;
296
- for part in field_str.split('.') {
297
- if let Some(obj) = current.as_object() {
298
- match obj.get(part) {
299
- Some(val) => current = val,
300
- None => {
311
+
312
+ let mut current = &metadata_json;
313
+ for part in field_str.split('.') {
314
+ if let Some(obj) = current.as_object() {
315
+ match obj.get(part) {
316
+ Some(val) => current = val,
317
+ None => {
318
+ return CMetadataField {
319
+ name: field_name,
320
+ json_value: ptr::null_mut(),
321
+ is_null: 1,
322
+ };
323
+ }
324
+ }
325
+ } else {
301
326
  return CMetadataField {
302
327
  name: field_name,
303
328
  json_value: ptr::null_mut(),
@@ -305,40 +330,39 @@ pub unsafe extern "C" fn kreuzberg_result_get_metadata_field(
305
330
  };
306
331
  }
307
332
  }
308
- } else {
309
- return CMetadataField {
310
- name: field_name,
311
- json_value: ptr::null_mut(),
312
- is_null: 1,
313
- };
314
- }
315
- }
316
333
 
317
- match serde_json::to_string(current) {
318
- Ok(json) => match CString::new(json) {
319
- Ok(c_string) => CMetadataField {
320
- name: field_name,
321
- json_value: c_string.into_raw(),
322
- is_null: 0,
323
- },
324
- Err(e) => {
325
- set_last_error(format!("Failed to convert field value to C string: {}", e));
326
- CMetadataField {
327
- name: field_name,
328
- json_value: ptr::null_mut(),
329
- is_null: 1,
334
+ match serde_json::to_string(current) {
335
+ Ok(json) => match CString::new(json) {
336
+ Ok(c_string) => CMetadataField {
337
+ name: field_name,
338
+ json_value: c_string.into_raw(),
339
+ is_null: 0,
340
+ },
341
+ Err(e) => {
342
+ set_last_error(format!("Failed to convert field value to C string: {}", e));
343
+ CMetadataField {
344
+ name: field_name,
345
+ json_value: ptr::null_mut(),
346
+ is_null: 1,
347
+ }
348
+ }
349
+ },
350
+ Err(e) => {
351
+ set_last_error(format!("Failed to serialize field value: {}", e));
352
+ CMetadataField {
353
+ name: field_name,
354
+ json_value: ptr::null_mut(),
355
+ is_null: 1,
356
+ }
330
357
  }
331
358
  }
332
359
  },
333
- Err(e) => {
334
- set_last_error(format!("Failed to serialize field value: {}", e));
335
- CMetadataField {
336
- name: field_name,
337
- json_value: ptr::null_mut(),
338
- is_null: 1,
339
- }
360
+ CMetadataField {
361
+ name: field_name,
362
+ json_value: ptr::null_mut(),
363
+ is_null: 1,
340
364
  }
341
- }
365
+ )
342
366
  }
343
367
 
344
368
  #[cfg(test)]
@@ -399,6 +423,8 @@ mod tests {
399
423
  ]),
400
424
  images: None,
401
425
  pages: None,
426
+ djot_content: None,
427
+ elements: None,
402
428
  }
403
429
  }
404
430
 
@@ -453,6 +453,8 @@ mod tests {
453
453
  ]),
454
454
  images: None,
455
455
  pages: None,
456
+ djot_content: None,
457
+ elements: None,
456
458
  }
457
459
  }
458
460
 
@@ -717,6 +719,8 @@ mod tests {
717
719
  chunks: None,
718
720
  images: None,
719
721
  pages: None,
722
+ djot_content: None,
723
+ elements: None,
720
724
  };
721
725
 
722
726
  let result_ptr = &result as *const ExtractionResult;
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.0.8"
3
+ version = "4.1.0"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -26,7 +26,7 @@ image = { workspace = true }
26
26
  [build-dependencies]
27
27
  cc = { version = "^1.2.53", optional = true }
28
28
  cmake = { version = "0.1.57", optional = true }
29
- zip = { version = "7.1.0", optional = true }
29
+ zip = { version = "7.2.0", optional = true }
30
30
 
31
31
  # Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
32
32
  [target.'cfg(target_os = "windows")'.build-dependencies]