kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,326 @@
1
+ //! ExtractionResult conversion to Ruby values
2
+ //!
3
+ //! Handles conversion of Kreuzberg ExtractionResult to Ruby Hash,
4
+ //! including complex nested structures like chunks, images, tables, and elements.
5
+
6
+ use crate::error_handling::runtime_error;
7
+ use crate::helpers::{json_value_to_ruby, set_hash_entry};
8
+
9
+ use kreuzberg::ExtractionResult as RustExtractionResult;
10
+ use magnus::{Error, RHash, Ruby, IntoValue};
11
+ use magnus::value::ReprValue;
12
+
13
+ /// Convert Kreuzberg ExtractionResult to Ruby Hash
14
+ ///
15
+ /// Converts the Rust extraction result into a Ruby hash with all fields including:
16
+ /// - content, mime_type, metadata
17
+ /// - tables (with cells and markdown)
18
+ /// - detected_languages
19
+ /// - chunks (with embeddings)
20
+ /// - images (including OCR results)
21
+ /// - pages (with per-page content)
22
+ /// - elements (for element-based format)
23
+ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Result<RHash, Error> {
24
+ let hash = ruby.hash_new();
25
+
26
+ // Set content and MIME type
27
+ let content_value = ruby.str_new(result.content.as_str()).into_value_with(ruby);
28
+ set_hash_entry(ruby, &hash, "content", content_value)?;
29
+
30
+ let mime_value = ruby.str_new(result.mime_type.as_str()).into_value_with(ruby);
31
+ set_hash_entry(ruby, &hash, "mime_type", mime_value)?;
32
+
33
+ // Set metadata both as JSON string and parsed hash
34
+ let metadata_json = serde_json::to_string(&result.metadata)
35
+ .map_err(|e| runtime_error(format!("Failed to serialize metadata: {}", e)))?;
36
+ let metadata_json_value = ruby.str_new(&metadata_json).into_value_with(ruby);
37
+ set_hash_entry(ruby, &hash, "metadata_json", metadata_json_value)?;
38
+ let metadata_value = serde_json::to_value(&result.metadata)
39
+ .map_err(|e| runtime_error(format!("Failed to serialize metadata: {}", e)))?;
40
+ let metadata_hash = json_value_to_ruby(ruby, &metadata_value)?;
41
+ set_hash_entry(ruby, &hash, "metadata", metadata_hash)?;
42
+
43
+ // Convert tables
44
+ let tables_array = ruby.ary_new();
45
+ for table in result.tables {
46
+ let table_hash = ruby.hash_new();
47
+
48
+ let cells_array = ruby.ary_new();
49
+ for row in table.cells {
50
+ let row_array = ruby.ary_from_vec(row);
51
+ cells_array.push(row_array)?;
52
+ }
53
+ table_hash.aset("cells", cells_array)?;
54
+ table_hash.aset("markdown", table.markdown)?;
55
+ table_hash.aset("page_number", table.page_number)?;
56
+
57
+ tables_array.push(table_hash)?;
58
+ }
59
+ let tables_value = tables_array.into_value_with(ruby);
60
+ set_hash_entry(ruby, &hash, "tables", tables_value)?;
61
+
62
+ // Convert detected languages
63
+ if let Some(langs) = result.detected_languages {
64
+ let langs_array = ruby.ary_from_vec(langs);
65
+ let langs_value = langs_array.into_value_with(ruby);
66
+ set_hash_entry(ruby, &hash, "detected_languages", langs_value)?;
67
+ } else {
68
+ set_hash_entry(ruby, &hash, "detected_languages", ruby.qnil().as_value())?;
69
+ }
70
+
71
+ // Convert chunks
72
+ if let Some(chunks) = result.chunks {
73
+ let chunks_array = ruby.ary_new();
74
+ for chunk in chunks {
75
+ let chunk_hash = ruby.hash_new();
76
+ chunk_hash.aset("content", chunk.content)?;
77
+ chunk_hash.aset("byte_start", chunk.metadata.byte_start)?;
78
+ chunk_hash.aset("byte_end", chunk.metadata.byte_end)?;
79
+ if let Some(token_count) = chunk.metadata.token_count {
80
+ chunk_hash.aset("token_count", token_count)?;
81
+ } else {
82
+ chunk_hash.aset("token_count", ruby.qnil().as_value())?;
83
+ }
84
+ chunk_hash.aset("chunk_index", chunk.metadata.chunk_index)?;
85
+ chunk_hash.aset("total_chunks", chunk.metadata.total_chunks)?;
86
+ if let Some(first_page) = chunk.metadata.first_page {
87
+ chunk_hash.aset("first_page", first_page as i64)?;
88
+ } else {
89
+ chunk_hash.aset("first_page", ruby.qnil().as_value())?;
90
+ }
91
+ if let Some(last_page) = chunk.metadata.last_page {
92
+ chunk_hash.aset("last_page", last_page as i64)?;
93
+ } else {
94
+ chunk_hash.aset("last_page", ruby.qnil().as_value())?;
95
+ }
96
+ if let Some(embedding) = chunk.embedding {
97
+ let embedding_array = ruby.ary_new();
98
+ for value in embedding {
99
+ embedding_array.push(ruby.float_from_f64(value as f64).into_value_with(ruby))?;
100
+ }
101
+ chunk_hash.aset("embedding", embedding_array)?;
102
+ } else {
103
+ chunk_hash.aset("embedding", ruby.qnil().as_value())?;
104
+ }
105
+ chunks_array.push(chunk_hash)?;
106
+ }
107
+ let chunks_value = chunks_array.into_value_with(ruby);
108
+ set_hash_entry(ruby, &hash, "chunks", chunks_value)?;
109
+ } else {
110
+ set_hash_entry(ruby, &hash, "chunks", ruby.qnil().as_value())?;
111
+ }
112
+
113
+ // Convert images
114
+ if let Some(images) = result.images {
115
+ let images_array = ruby.ary_new();
116
+ for image in images {
117
+ let image_hash = ruby.hash_new();
118
+ let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
119
+ image_hash.aset("data", data_value)?;
120
+ image_hash.aset("format", image.format)?;
121
+ image_hash.aset("image_index", image.image_index as i64)?;
122
+ if let Some(page) = image.page_number {
123
+ image_hash.aset("page_number", page as i64)?;
124
+ } else {
125
+ image_hash.aset("page_number", ruby.qnil().as_value())?;
126
+ }
127
+ if let Some(width) = image.width {
128
+ image_hash.aset("width", width as i64)?;
129
+ } else {
130
+ image_hash.aset("width", ruby.qnil().as_value())?;
131
+ }
132
+ if let Some(height) = image.height {
133
+ image_hash.aset("height", height as i64)?;
134
+ } else {
135
+ image_hash.aset("height", ruby.qnil().as_value())?;
136
+ }
137
+ if let Some(colorspace) = image.colorspace {
138
+ image_hash.aset("colorspace", colorspace)?;
139
+ } else {
140
+ image_hash.aset("colorspace", ruby.qnil().as_value())?;
141
+ }
142
+ if let Some(bits) = image.bits_per_component {
143
+ image_hash.aset("bits_per_component", bits as i64)?;
144
+ } else {
145
+ image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
146
+ }
147
+ image_hash.aset(
148
+ "is_mask",
149
+ if image.is_mask {
150
+ ruby.qtrue().as_value()
151
+ } else {
152
+ ruby.qfalse().as_value()
153
+ },
154
+ )?;
155
+ if let Some(description) = image.description {
156
+ image_hash.aset("description", description)?;
157
+ } else {
158
+ image_hash.aset("description", ruby.qnil().as_value())?;
159
+ }
160
+ if let Some(ocr_result) = image.ocr_result {
161
+ let nested = extraction_result_to_ruby(ruby, *ocr_result)?;
162
+ image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
163
+ } else {
164
+ image_hash.aset("ocr_result", ruby.qnil().as_value())?;
165
+ }
166
+ images_array.push(image_hash)?;
167
+ }
168
+ set_hash_entry(ruby, &hash, "images", images_array.into_value_with(ruby))?;
169
+ } else {
170
+ set_hash_entry(ruby, &hash, "images", ruby.qnil().as_value())?;
171
+ }
172
+
173
+ // Convert pages
174
+ if let Some(page_content_list) = result.pages {
175
+ let pages_array = ruby.ary_new();
176
+ for page_content in page_content_list {
177
+ let page_hash = ruby.hash_new();
178
+ page_hash.aset("page_number", page_content.page_number as i64)?;
179
+ page_hash.aset("content", page_content.content)?;
180
+
181
+ let tables_array = ruby.ary_new();
182
+ for table in page_content.tables {
183
+ let table_hash = ruby.hash_new();
184
+
185
+ let cells_array = ruby.ary_new();
186
+ for row in table.cells.clone() {
187
+ let row_array = ruby.ary_from_vec(row);
188
+ cells_array.push(row_array)?;
189
+ }
190
+ table_hash.aset("cells", cells_array)?;
191
+ table_hash.aset("markdown", table.markdown.clone())?;
192
+ table_hash.aset("page_number", table.page_number as i64)?;
193
+
194
+ tables_array.push(table_hash)?;
195
+ }
196
+ page_hash.aset("tables", tables_array)?;
197
+
198
+ let images_array = ruby.ary_new();
199
+ for image in page_content.images {
200
+ let image_hash = ruby.hash_new();
201
+ let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
202
+ image_hash.aset("data", data_value)?;
203
+ image_hash.aset("format", image.format.clone())?;
204
+ image_hash.aset("image_index", image.image_index as i64)?;
205
+ if let Some(page) = image.page_number {
206
+ image_hash.aset("page_number", page as i64)?;
207
+ } else {
208
+ image_hash.aset("page_number", ruby.qnil().as_value())?;
209
+ }
210
+ if let Some(width) = image.width {
211
+ image_hash.aset("width", width as i64)?;
212
+ } else {
213
+ image_hash.aset("width", ruby.qnil().as_value())?;
214
+ }
215
+ if let Some(height) = image.height {
216
+ image_hash.aset("height", height as i64)?;
217
+ } else {
218
+ image_hash.aset("height", ruby.qnil().as_value())?;
219
+ }
220
+ if let Some(colorspace) = &image.colorspace {
221
+ image_hash.aset("colorspace", colorspace.clone())?;
222
+ } else {
223
+ image_hash.aset("colorspace", ruby.qnil().as_value())?;
224
+ }
225
+ if let Some(bits) = image.bits_per_component {
226
+ image_hash.aset("bits_per_component", bits as i64)?;
227
+ } else {
228
+ image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
229
+ }
230
+ image_hash.aset(
231
+ "is_mask",
232
+ if image.is_mask {
233
+ ruby.qtrue().as_value()
234
+ } else {
235
+ ruby.qfalse().as_value()
236
+ },
237
+ )?;
238
+ if let Some(description) = &image.description {
239
+ image_hash.aset("description", description.clone())?;
240
+ } else {
241
+ image_hash.aset("description", ruby.qnil().as_value())?;
242
+ }
243
+ if let Some(ocr_result) = &image.ocr_result {
244
+ let nested = extraction_result_to_ruby(ruby, (**ocr_result).clone())?;
245
+ image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
246
+ } else {
247
+ image_hash.aset("ocr_result", ruby.qnil().as_value())?;
248
+ }
249
+ images_array.push(image_hash)?;
250
+ }
251
+ page_hash.aset("images", images_array)?;
252
+
253
+ pages_array.push(page_hash)?;
254
+ }
255
+ set_hash_entry(ruby, &hash, "pages", pages_array.into_value_with(ruby))?;
256
+ } else {
257
+ set_hash_entry(ruby, &hash, "pages", ruby.qnil().as_value())?;
258
+ }
259
+
260
+ // Convert elements (element-based format)
261
+ if let Some(elements_list) = result.elements {
262
+ let elements_array = ruby.ary_new();
263
+ for element in elements_list {
264
+ let element_hash = ruby.hash_new();
265
+ element_hash.aset("element_id", element.element_id.as_ref())?;
266
+
267
+ // Convert ElementType to snake_case string
268
+ use kreuzberg::types::ElementType as ET;
269
+ let element_type_str = match element.element_type {
270
+ ET::Title => "title",
271
+ ET::NarrativeText => "narrative_text",
272
+ ET::Heading => "heading",
273
+ ET::ListItem => "list_item",
274
+ ET::Table => "table",
275
+ ET::Image => "image",
276
+ ET::PageBreak => "page_break",
277
+ ET::CodeBlock => "code_block",
278
+ ET::BlockQuote => "block_quote",
279
+ ET::Footer => "footer",
280
+ ET::Header => "header",
281
+ };
282
+ element_hash.aset("element_type", element_type_str)?;
283
+ element_hash.aset("text", element.text)?;
284
+
285
+ let metadata_hash = ruby.hash_new();
286
+ if let Some(page_num) = element.metadata.page_number {
287
+ metadata_hash.aset("page_number", page_num as i64)?;
288
+ } else {
289
+ metadata_hash.aset("page_number", ruby.qnil().as_value())?;
290
+ }
291
+ if let Some(filename) = &element.metadata.filename {
292
+ metadata_hash.aset("filename", filename.as_str())?;
293
+ } else {
294
+ metadata_hash.aset("filename", ruby.qnil().as_value())?;
295
+ }
296
+ if let Some(coords) = element.metadata.coordinates {
297
+ let coords_hash = ruby.hash_new();
298
+ coords_hash.aset("x0", coords.x0)?;
299
+ coords_hash.aset("y0", coords.y0)?;
300
+ coords_hash.aset("x1", coords.x1)?;
301
+ coords_hash.aset("y1", coords.y1)?;
302
+ metadata_hash.aset("coordinates", coords_hash)?;
303
+ } else {
304
+ metadata_hash.aset("coordinates", ruby.qnil().as_value())?;
305
+ }
306
+ if let Some(elem_idx) = element.metadata.element_index {
307
+ metadata_hash.aset("element_index", elem_idx as i64)?;
308
+ } else {
309
+ metadata_hash.aset("element_index", ruby.qnil().as_value())?;
310
+ }
311
+ let additional_hash = ruby.hash_new();
312
+ for (key, value) in &element.metadata.additional {
313
+ additional_hash.aset(key.as_str(), value.as_str())?;
314
+ }
315
+ metadata_hash.aset("additional", additional_hash)?;
316
+
317
+ element_hash.aset("metadata", metadata_hash)?;
318
+ elements_array.push(element_hash)?;
319
+ }
320
+ set_hash_entry(ruby, &hash, "elements", elements_array.into_value_with(ruby))?;
321
+ } else {
322
+ set_hash_entry(ruby, &hash, "elements", ruby.qnil().as_value())?;
323
+ }
324
+
325
+ Ok(hash)
326
+ }
@@ -0,0 +1,4 @@
1
+ //! Validation functions for configuration and formats
2
+ //!
3
+ //! Provides validation for MIME types, formats, and other configuration parameters.
4
+ //! These validation functions are re-exported directly from lib.rs via kreuzberg_ffi.
@@ -733,11 +733,42 @@ module Kreuzberg
733
733
  # @example Load from YAML
734
734
  # config = Kreuzberg::Config::Extraction.from_file("config.yaml")
735
735
  #
736
+ # Keys that are allowed in the Extraction config
737
+ ALLOWED_KEYS = %i[
738
+ use_cache enable_quality_processing force_ocr ocr chunking
739
+ language_detection pdf_options image_extraction image_preprocessing
740
+ postprocessor token_reduction keywords html_options pages
741
+ max_concurrent_extractions
742
+ ].freeze
743
+
744
+ # Aliases for backward compatibility
745
+ KEY_ALIASES = {
746
+ images: :image_extraction
747
+ }.freeze
748
+
736
749
  def self.from_file(path)
737
750
  hash = Kreuzberg._config_from_file_native(path)
738
- new(**hash.transform_keys(&:to_sym))
751
+ new(**normalize_hash_keys(hash))
752
+ end
753
+
754
+ # Normalize hash keys from native function
755
+ # - Converts string keys to symbols
756
+ # - Maps aliased keys to their canonical names
757
+ # - Filters out unknown keys
758
+ def self.normalize_hash_keys(hash)
759
+ symbolized = hash.transform_keys(&:to_sym)
760
+
761
+ # Apply key aliases
762
+ KEY_ALIASES.each do |from, to|
763
+ symbolized[to] = symbolized.delete(from) if symbolized.key?(from) && !symbolized.key?(to)
764
+ end
765
+
766
+ # Filter to only allowed keys
767
+ symbolized.slice(*ALLOWED_KEYS)
739
768
  end
740
769
 
770
+ private_class_method :normalize_hash_keys
771
+
741
772
  # Discover configuration file in current or parent directories.
742
773
  #
743
774
  # Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
@@ -755,7 +786,7 @@ module Kreuzberg
755
786
  hash = Kreuzberg._config_discover_native
756
787
  return nil if hash.nil?
757
788
 
758
- new(**hash.transform_keys(&:to_sym))
789
+ new(**normalize_hash_keys(hash))
759
790
  end
760
791
 
761
792
  def initialize(
@@ -905,6 +936,72 @@ module Kreuzberg
905
936
  self
906
937
  end
907
938
 
939
+ # Set a configuration field using hash-like syntax
940
+ #
941
+ # @param key [Symbol, String] Field name to set
942
+ # @param value [Object] Value to set
943
+ # @return [Object] The value that was set
944
+ #
945
+ # @example
946
+ # config = Extraction.new(use_cache: true)
947
+ # config[:use_cache] = false
948
+ # config[:force_ocr] = true
949
+ #
950
+ # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength
951
+ def []=(key, value)
952
+ key_sym = key.to_sym
953
+ case key_sym
954
+ when :use_cache
955
+ @use_cache = value ? true : false
956
+ when :enable_quality_processing
957
+ @enable_quality_processing = value ? true : false
958
+ when :force_ocr
959
+ @force_ocr = value ? true : false
960
+ when :ocr
961
+ @ocr = normalize_config(value, OCR)
962
+ when :chunking
963
+ @chunking = normalize_config(value, Chunking)
964
+ when :language_detection
965
+ @language_detection = normalize_config(value, LanguageDetection)
966
+ when :pdf_options
967
+ @pdf_options = normalize_config(value, PDF)
968
+ when :image_extraction
969
+ @image_extraction = normalize_config(value, ImageExtraction)
970
+ when :image_preprocessing
971
+ @image_preprocessing = normalize_config(value, ImagePreprocessing)
972
+ when :postprocessor
973
+ @postprocessor = normalize_config(value, PostProcessor)
974
+ when :token_reduction
975
+ @token_reduction = normalize_config(value, TokenReduction)
976
+ when :keywords
977
+ @keywords = normalize_config(value, Keywords)
978
+ when :html_options
979
+ @html_options = normalize_config(value, HtmlOptions)
980
+ when :pages
981
+ @pages = normalize_config(value, PageConfig)
982
+ when :max_concurrent_extractions
983
+ @max_concurrent_extractions = value&.to_i
984
+ else
985
+ raise ArgumentError, "Unknown configuration key: #{key}"
986
+ end
987
+ end
988
+ # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength
989
+
990
+ # Get a configuration field using hash-like syntax
991
+ #
992
+ # @param key [Symbol, String] Field name to get
993
+ # @return [Object, nil] The field value
994
+ #
995
+ # @example
996
+ # config = Extraction.new(use_cache: true)
997
+ # config[:use_cache] # => true
998
+ #
999
+ def [](key)
1000
+ send(key.to_sym)
1001
+ rescue NoMethodError
1002
+ nil
1003
+ end
1004
+
908
1005
  private
909
1006
 
910
1007
  def normalize_config(value, klass)
@@ -11,7 +11,7 @@ module Kreuzberg
11
11
  # rubocop:disable Metrics/ClassLength
12
12
  class Result
13
13
  attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
14
- :detected_languages, :chunks, :images, :pages
14
+ :detected_languages, :chunks, :images, :pages, :elements
15
15
 
16
16
  # @!attribute [r] cells
17
17
  # @return [Array<Array<String>>] Table cells (2D array)
@@ -114,6 +114,68 @@ module Kreuzberg
114
114
  end
115
115
  end
116
116
 
117
+ # @!attribute [r] x0
118
+ # @return [Float] Left x-coordinate
119
+ # @!attribute [r] y0
120
+ # @return [Float] Bottom y-coordinate
121
+ # @!attribute [r] x1
122
+ # @return [Float] Right x-coordinate
123
+ # @!attribute [r] y1
124
+ # @return [Float] Top y-coordinate
125
+ ElementBoundingBox = Struct.new(:x0, :y0, :x1, :y1, keyword_init: true) do
126
+ def to_h
127
+ { x0: x0, y0: y0, x1: x1, y1: y1 }
128
+ end
129
+ end
130
+
131
+ # @!attribute [r] page_number
132
+ # @return [Integer, nil] Page number (1-indexed)
133
+ # @!attribute [r] filename
134
+ # @return [String, nil] Source filename or document name
135
+ # @!attribute [r] coordinates
136
+ # @return [ElementBoundingBox, nil] Bounding box coordinates if available
137
+ # @!attribute [r] element_index
138
+ # @return [Integer, nil] Position index in the element sequence
139
+ # @!attribute [r] additional
140
+ # @return [Hash<String, String>] Additional custom metadata
141
+ ElementMetadataStruct = Struct.new(
142
+ :page_number,
143
+ :filename,
144
+ :coordinates,
145
+ :element_index,
146
+ :additional,
147
+ keyword_init: true
148
+ ) do
149
+ def to_h
150
+ {
151
+ page_number: page_number,
152
+ filename: filename,
153
+ coordinates: coordinates&.to_h,
154
+ element_index: element_index,
155
+ additional: additional
156
+ }
157
+ end
158
+ end
159
+
160
+ # @!attribute [r] element_id
161
+ # @return [String] Unique element identifier
162
+ # @!attribute [r] element_type
163
+ # @return [String] Semantic type of the element
164
+ # @!attribute [r] text
165
+ # @return [String] Text content of the element
166
+ # @!attribute [r] metadata
167
+ # @return [ElementMetadataStruct] Metadata about the element
168
+ ElementStruct = Struct.new(:element_id, :element_type, :text, :metadata, keyword_init: true) do
169
+ def to_h
170
+ {
171
+ element_id: element_id,
172
+ element_type: element_type,
173
+ text: text,
174
+ metadata: metadata&.to_h
175
+ }
176
+ end
177
+ end
178
+
117
179
  # Initialize from native hash result
118
180
  #
119
181
  # @param hash [Hash] Hash returned from native extension
@@ -128,6 +190,7 @@ module Kreuzberg
128
190
  @chunks = parse_chunks(get_value(hash, 'chunks'))
129
191
  @images = parse_images(get_value(hash, 'images'))
130
192
  @pages = parse_pages(get_value(hash, 'pages'))
193
+ @elements = parse_elements(get_value(hash, 'elements'))
131
194
  end
132
195
 
133
196
  # Convert to hash
@@ -143,7 +206,8 @@ module Kreuzberg
143
206
  detected_languages: @detected_languages,
144
207
  chunks: serialize_chunks,
145
208
  images: serialize_images,
146
- pages: serialize_pages
209
+ pages: serialize_pages,
210
+ elements: serialize_elements
147
211
  }
148
212
  end
149
213
 
@@ -249,6 +313,10 @@ module Kreuzberg
249
313
  @pages&.map(&:to_h)
250
314
  end
251
315
 
316
+ def serialize_elements
317
+ @elements&.map(&:to_h)
318
+ end
319
+
252
320
  def get_value(hash, key, default = nil)
253
321
  hash[key] || hash[key.to_sym] || default
254
322
  end
@@ -329,6 +397,43 @@ module Kreuzberg
329
397
  )
330
398
  end
331
399
  end
400
+
401
+ def parse_elements(elements_data)
402
+ return nil if elements_data.nil?
403
+
404
+ elements_data.map { |element_hash| parse_element(element_hash) }
405
+ end
406
+
407
+ def parse_element(element_hash)
408
+ metadata_hash = element_hash['metadata'] || {}
409
+ coordinates = parse_element_coordinates(metadata_hash['coordinates'])
410
+
411
+ metadata = ElementMetadataStruct.new(
412
+ page_number: metadata_hash['page_number'],
413
+ filename: metadata_hash['filename'],
414
+ coordinates: coordinates,
415
+ element_index: metadata_hash['element_index'],
416
+ additional: metadata_hash['additional'] || {}
417
+ )
418
+
419
+ ElementStruct.new(
420
+ element_id: element_hash['element_id'],
421
+ element_type: element_hash['element_type'],
422
+ text: element_hash['text'],
423
+ metadata: metadata
424
+ )
425
+ end
426
+
427
+ def parse_element_coordinates(coordinates_data)
428
+ return nil if coordinates_data.nil?
429
+
430
+ ElementBoundingBox.new(
431
+ x0: coordinates_data['x0'].to_f,
432
+ y0: coordinates_data['y0'].to_f,
433
+ x1: coordinates_data['x1'].to_f,
434
+ y1: coordinates_data['y1'].to_f
435
+ )
436
+ end
332
437
  end
333
438
  # rubocop:enable Metrics/ClassLength
334
439
  end