kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,326 @@
1
+ //! ExtractionResult conversion to Ruby values
2
+ //!
3
+ //! Handles conversion of Kreuzberg ExtractionResult to Ruby Hash,
4
+ //! including complex nested structures like chunks, images, tables, and elements.
5
+
6
+ use crate::error_handling::runtime_error;
7
+ use crate::helpers::{json_value_to_ruby, set_hash_entry};
8
+
9
+ use kreuzberg::ExtractionResult as RustExtractionResult;
10
+ use magnus::{Error, RHash, Ruby, IntoValue};
11
+ use magnus::value::ReprValue;
12
+
13
+ /// Convert a Kreuzberg `ExtractionResult` (taken by value) into a newly created Ruby Hash
14
+ ///
15
+ /// Optional fields that are `None` are stored as `nil`; any hash/array insertion or metadata serialization failure is returned as `Err`. Fields written:
16
+ /// - content, mime_type, metadata (plus metadata_json, the same metadata serialized as a JSON string)
17
+ /// - tables (each with cells, markdown, and page_number)
18
+ /// - detected_languages
19
+ /// - chunks (content, byte_start/byte_end, token_count, chunk_index/total_chunks, first_page/last_page, embedding)
20
+ /// - images (data and image properties; a nested ocr_result is converted by calling this function recursively)
21
+ /// - pages (per-page content together with that page's tables and images)
22
+ /// - elements (for element-based format: element_id, element_type, text, metadata)
23
+ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Result<RHash, Error> {
24
+ let hash = ruby.hash_new();
25
+
26
+ // Set content and MIME type as Ruby strings
27
+ let content_value = ruby.str_new(result.content.as_str()).into_value_with(ruby);
28
+ set_hash_entry(ruby, &hash, "content", content_value)?;
29
+
30
+ let mime_value = ruby.str_new(result.mime_type.as_str()).into_value_with(ruby);
31
+ set_hash_entry(ruby, &hash, "mime_type", mime_value)?;
32
+
33
+ // Set metadata twice: as a JSON string ("metadata_json") and as a converted Ruby value ("metadata")
34
+ let metadata_json = serde_json::to_string(&result.metadata)
35
+ .map_err(|e| runtime_error(format!("Failed to serialize metadata: {}", e)))?;
36
+ let metadata_json_value = ruby.str_new(&metadata_json).into_value_with(ruby);
37
+ set_hash_entry(ruby, &hash, "metadata_json", metadata_json_value)?;
38
+ let metadata_value = serde_json::to_value(&result.metadata)
39
+ .map_err(|e| runtime_error(format!("Failed to serialize metadata: {}", e)))?;
40
+ let metadata_hash = json_value_to_ruby(ruby, &metadata_value)?;
41
+ set_hash_entry(ruby, &hash, "metadata", metadata_hash)?;
42
+
43
+ // Convert tables: one hash per table holding its cell rows, markdown, and page number
44
+ let tables_array = ruby.ary_new();
45
+ for table in result.tables {
46
+ let table_hash = ruby.hash_new();
47
+
48
+ let cells_array = ruby.ary_new();
49
+ for row in table.cells {
50
+ let row_array = ruby.ary_from_vec(row);
51
+ cells_array.push(row_array)?;
52
+ }
53
+ table_hash.aset("cells", cells_array)?;
54
+ table_hash.aset("markdown", table.markdown)?;
55
+ table_hash.aset("page_number", table.page_number)?;
56
+
57
+ tables_array.push(table_hash)?;
58
+ }
59
+ let tables_value = tables_array.into_value_with(ruby);
60
+ set_hash_entry(ruby, &hash, "tables", tables_value)?;
61
+
62
+ // Convert detected languages (nil when the field is None)
63
+ if let Some(langs) = result.detected_languages {
64
+ let langs_array = ruby.ary_from_vec(langs);
65
+ let langs_value = langs_array.into_value_with(ruby);
66
+ set_hash_entry(ruby, &hash, "detected_languages", langs_value)?;
67
+ } else {
68
+ set_hash_entry(ruby, &hash, "detected_languages", ruby.qnil().as_value())?;
69
+ }
70
+
71
+ // Convert chunks: nil when absent, otherwise one hash per chunk; embedding values are cast to f64
72
+ if let Some(chunks) = result.chunks {
73
+ let chunks_array = ruby.ary_new();
74
+ for chunk in chunks {
75
+ let chunk_hash = ruby.hash_new();
76
+ chunk_hash.aset("content", chunk.content)?;
77
+ chunk_hash.aset("byte_start", chunk.metadata.byte_start)?;
78
+ chunk_hash.aset("byte_end", chunk.metadata.byte_end)?;
79
+ if let Some(token_count) = chunk.metadata.token_count {
80
+ chunk_hash.aset("token_count", token_count)?;
81
+ } else {
82
+ chunk_hash.aset("token_count", ruby.qnil().as_value())?;
83
+ }
84
+ chunk_hash.aset("chunk_index", chunk.metadata.chunk_index)?;
85
+ chunk_hash.aset("total_chunks", chunk.metadata.total_chunks)?;
86
+ if let Some(first_page) = chunk.metadata.first_page {
87
+ chunk_hash.aset("first_page", first_page as i64)?;
88
+ } else {
89
+ chunk_hash.aset("first_page", ruby.qnil().as_value())?;
90
+ }
91
+ if let Some(last_page) = chunk.metadata.last_page {
92
+ chunk_hash.aset("last_page", last_page as i64)?;
93
+ } else {
94
+ chunk_hash.aset("last_page", ruby.qnil().as_value())?;
95
+ }
96
+ if let Some(embedding) = chunk.embedding {
97
+ let embedding_array = ruby.ary_new();
98
+ for value in embedding {
99
+ embedding_array.push(ruby.float_from_f64(value as f64).into_value_with(ruby))?;
100
+ }
101
+ chunk_hash.aset("embedding", embedding_array)?;
102
+ } else {
103
+ chunk_hash.aset("embedding", ruby.qnil().as_value())?;
104
+ }
105
+ chunks_array.push(chunk_hash)?;
106
+ }
107
+ let chunks_value = chunks_array.into_value_with(ruby);
108
+ set_hash_entry(ruby, &hash, "chunks", chunks_value)?;
109
+ } else {
110
+ set_hash_entry(ruby, &hash, "chunks", ruby.qnil().as_value())?;
111
+ }
112
+
113
+ // Convert images: nil when absent; a nested ocr_result is converted recursively by this same function
114
+ if let Some(images) = result.images {
115
+ let images_array = ruby.ary_new();
116
+ for image in images {
117
+ let image_hash = ruby.hash_new();
118
+ let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
119
+ image_hash.aset("data", data_value)?;
120
+ image_hash.aset("format", image.format)?;
121
+ image_hash.aset("image_index", image.image_index as i64)?;
122
+ if let Some(page) = image.page_number {
123
+ image_hash.aset("page_number", page as i64)?;
124
+ } else {
125
+ image_hash.aset("page_number", ruby.qnil().as_value())?;
126
+ }
127
+ if let Some(width) = image.width {
128
+ image_hash.aset("width", width as i64)?;
129
+ } else {
130
+ image_hash.aset("width", ruby.qnil().as_value())?;
131
+ }
132
+ if let Some(height) = image.height {
133
+ image_hash.aset("height", height as i64)?;
134
+ } else {
135
+ image_hash.aset("height", ruby.qnil().as_value())?;
136
+ }
137
+ if let Some(colorspace) = image.colorspace {
138
+ image_hash.aset("colorspace", colorspace)?;
139
+ } else {
140
+ image_hash.aset("colorspace", ruby.qnil().as_value())?;
141
+ }
142
+ if let Some(bits) = image.bits_per_component {
143
+ image_hash.aset("bits_per_component", bits as i64)?;
144
+ } else {
145
+ image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
146
+ }
147
+ image_hash.aset(
148
+ "is_mask",
149
+ if image.is_mask {
150
+ ruby.qtrue().as_value()
151
+ } else {
152
+ ruby.qfalse().as_value()
153
+ },
154
+ )?;
155
+ if let Some(description) = image.description {
156
+ image_hash.aset("description", description)?;
157
+ } else {
158
+ image_hash.aset("description", ruby.qnil().as_value())?;
159
+ }
160
+ if let Some(ocr_result) = image.ocr_result {
161
+ let nested = extraction_result_to_ruby(ruby, *ocr_result)?;
162
+ image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
163
+ } else {
164
+ image_hash.aset("ocr_result", ruby.qnil().as_value())?;
165
+ }
166
+ images_array.push(image_hash)?;
167
+ }
168
+ set_hash_entry(ruby, &hash, "images", images_array.into_value_with(ruby))?;
169
+ } else {
170
+ set_hash_entry(ruby, &hash, "images", ruby.qnil().as_value())?;
171
+ }
172
+
173
+ // Convert pages: nil when absent; each page hash carries its own tables and images arrays
174
+ if let Some(page_content_list) = result.pages {
175
+ let pages_array = ruby.ary_new();
176
+ for page_content in page_content_list {
177
+ let page_hash = ruby.hash_new();
178
+ page_hash.aset("page_number", page_content.page_number as i64)?;
179
+ page_hash.aset("content", page_content.content)?;
180
+
181
+ let tables_array = ruby.ary_new();
182
+ for table in page_content.tables {
183
+ let table_hash = ruby.hash_new();
184
+
185
+ let cells_array = ruby.ary_new();
186
+ for row in table.cells.clone() {
187
+ let row_array = ruby.ary_from_vec(row);
188
+ cells_array.push(row_array)?;
189
+ }
190
+ table_hash.aset("cells", cells_array)?;
191
+ table_hash.aset("markdown", table.markdown.clone())?;
192
+ table_hash.aset("page_number", table.page_number as i64)?;
193
+
194
+ tables_array.push(table_hash)?;
195
+ }
196
+ page_hash.aset("tables", tables_array)?;
197
+
198
+ let images_array = ruby.ary_new();
199
+ for image in page_content.images {
200
+ let image_hash = ruby.hash_new();
201
+ let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
202
+ image_hash.aset("data", data_value)?;
203
+ image_hash.aset("format", image.format.clone())?;
204
+ image_hash.aset("image_index", image.image_index as i64)?;
205
+ if let Some(page) = image.page_number {
206
+ image_hash.aset("page_number", page as i64)?;
207
+ } else {
208
+ image_hash.aset("page_number", ruby.qnil().as_value())?;
209
+ }
210
+ if let Some(width) = image.width {
211
+ image_hash.aset("width", width as i64)?;
212
+ } else {
213
+ image_hash.aset("width", ruby.qnil().as_value())?;
214
+ }
215
+ if let Some(height) = image.height {
216
+ image_hash.aset("height", height as i64)?;
217
+ } else {
218
+ image_hash.aset("height", ruby.qnil().as_value())?;
219
+ }
220
+ if let Some(colorspace) = &image.colorspace {
221
+ image_hash.aset("colorspace", colorspace.clone())?;
222
+ } else {
223
+ image_hash.aset("colorspace", ruby.qnil().as_value())?;
224
+ }
225
+ if let Some(bits) = image.bits_per_component {
226
+ image_hash.aset("bits_per_component", bits as i64)?;
227
+ } else {
228
+ image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
229
+ }
230
+ image_hash.aset(
231
+ "is_mask",
232
+ if image.is_mask {
233
+ ruby.qtrue().as_value()
234
+ } else {
235
+ ruby.qfalse().as_value()
236
+ },
237
+ )?;
238
+ if let Some(description) = &image.description {
239
+ image_hash.aset("description", description.clone())?;
240
+ } else {
241
+ image_hash.aset("description", ruby.qnil().as_value())?;
242
+ }
243
+ if let Some(ocr_result) = &image.ocr_result {
244
+ let nested = extraction_result_to_ruby(ruby, (**ocr_result).clone())?;
245
+ image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
246
+ } else {
247
+ image_hash.aset("ocr_result", ruby.qnil().as_value())?;
248
+ }
249
+ images_array.push(image_hash)?;
250
+ }
251
+ page_hash.aset("images", images_array)?;
252
+
253
+ pages_array.push(page_hash)?;
254
+ }
255
+ set_hash_entry(ruby, &hash, "pages", pages_array.into_value_with(ruby))?;
256
+ } else {
257
+ set_hash_entry(ruby, &hash, "pages", ruby.qnil().as_value())?;
258
+ }
259
+
260
+ // Convert elements (element-based format): nil when absent, otherwise one hash per element
261
+ if let Some(elements_list) = result.elements {
262
+ let elements_array = ruby.ary_new();
263
+ for element in elements_list {
264
+ let element_hash = ruby.hash_new();
265
+ element_hash.aset("element_id", element.element_id.as_ref())?;
266
+
267
+ // Map each ElementType variant to its snake_case string name
268
+ use kreuzberg::types::ElementType as ET;
269
+ let element_type_str = match element.element_type {
270
+ ET::Title => "title",
271
+ ET::NarrativeText => "narrative_text",
272
+ ET::Heading => "heading",
273
+ ET::ListItem => "list_item",
274
+ ET::Table => "table",
275
+ ET::Image => "image",
276
+ ET::PageBreak => "page_break",
277
+ ET::CodeBlock => "code_block",
278
+ ET::BlockQuote => "block_quote",
279
+ ET::Footer => "footer",
280
+ ET::Header => "header",
281
+ };
282
+ element_hash.aset("element_type", element_type_str)?;
283
+ element_hash.aset("text", element.text)?;
284
+
285
+ let metadata_hash = ruby.hash_new();
286
+ if let Some(page_num) = element.metadata.page_number {
287
+ metadata_hash.aset("page_number", page_num as i64)?;
288
+ } else {
289
+ metadata_hash.aset("page_number", ruby.qnil().as_value())?;
290
+ }
291
+ if let Some(filename) = &element.metadata.filename {
292
+ metadata_hash.aset("filename", filename.as_str())?;
293
+ } else {
294
+ metadata_hash.aset("filename", ruby.qnil().as_value())?;
295
+ }
296
+ if let Some(coords) = element.metadata.coordinates {
297
+ let coords_hash = ruby.hash_new();
298
+ coords_hash.aset("x0", coords.x0)?;
299
+ coords_hash.aset("y0", coords.y0)?;
300
+ coords_hash.aset("x1", coords.x1)?;
301
+ coords_hash.aset("y1", coords.y1)?;
302
+ metadata_hash.aset("coordinates", coords_hash)?;
303
+ } else {
304
+ metadata_hash.aset("coordinates", ruby.qnil().as_value())?;
305
+ }
306
+ if let Some(elem_idx) = element.metadata.element_index {
307
+ metadata_hash.aset("element_index", elem_idx as i64)?;
308
+ } else {
309
+ metadata_hash.aset("element_index", ruby.qnil().as_value())?;
310
+ }
311
+ let additional_hash = ruby.hash_new();
312
+ for (key, value) in &element.metadata.additional {
313
+ additional_hash.aset(key.as_str(), value.as_str())?;
314
+ }
315
+ metadata_hash.aset("additional", additional_hash)?;
316
+
317
+ element_hash.aset("metadata", metadata_hash)?;
318
+ elements_array.push(element_hash)?;
319
+ }
320
+ set_hash_entry(ruby, &hash, "elements", elements_array.into_value_with(ruby))?;
321
+ } else {
322
+ set_hash_entry(ruby, &hash, "elements", ruby.qnil().as_value())?;
323
+ }
324
+
325
+ Ok(hash)
326
+ }
@@ -0,0 +1,4 @@
1
+ //! Validation functions for configuration and formats
2
+ //!
3
+ //! Provides validation for MIME types, formats, and other configuration parameters.
4
+ //! These validation functions are re-exported directly from lib.rs via kreuzberg_ffi.
@@ -905,6 +905,72 @@ module Kreuzberg
905
905
  self
906
906
  end
907
907
 
908
# Assign a configuration field by name, hash-style.
#
# Boolean fields are coerced to +true+/+false+, nested configuration fields
# are passed through +normalize_config+ together with their config class,
# and +max_concurrent_extractions+ is converted with +to_i+ (nil is kept).
#
# @param key [Symbol, String] Field name to set
# @param value [Object] Value to set
# @return [Object] The value that was stored
# @raise [ArgumentError] if +key+ does not name a known configuration field
#
# @example
#   config = Extraction.new(use_cache: true)
#   config[:use_cache] = false
#   config[:force_ocr] = true
#
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength
def []=(key, value)
  field = key.to_sym

  stored =
    case field
    when :use_cache, :enable_quality_processing, :force_ocr
      value ? true : false
    when :max_concurrent_extractions
      value&.to_i
    when :ocr then normalize_config(value, OCR)
    when :chunking then normalize_config(value, Chunking)
    when :language_detection then normalize_config(value, LanguageDetection)
    when :pdf_options then normalize_config(value, PDF)
    when :image_extraction then normalize_config(value, ImageExtraction)
    when :image_preprocessing then normalize_config(value, ImagePreprocessing)
    when :postprocessor then normalize_config(value, PostProcessor)
    when :token_reduction then normalize_config(value, TokenReduction)
    when :keywords then normalize_config(value, Keywords)
    when :html_options then normalize_config(value, HtmlOptions)
    when :pages then normalize_config(value, PageConfig)
    else
      raise ArgumentError, "Unknown configuration key: #{key}"
    end

  # Every accepted key maps onto the instance variable of the same name.
  instance_variable_set(:"@#{field}", stored)
end
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength
958
+
959
# Get a configuration field using hash-like syntax
#
# The lookup goes through +public_send+ after a +respond_to?+ check, so a
# key can only reach public methods. Private helpers and private Kernel
# methods (for example +exit+) are never invoked, and an error raised
# inside a reader is no longer silently turned into +nil+.
#
# @param key [Symbol, String] Field name to get
# @return [Object, nil] The field value, or nil when the key cannot be
#   converted to a Symbol or no public method with that name exists
#
# @example
#   config = Extraction.new(use_cache: true)
#   config[:use_cache]  # => true
#   config[:unknown]    # => nil
#
def [](key)
  # A key that cannot become a Symbol cannot name a field.
  return nil unless key.respond_to?(:to_sym)

  field = key.to_sym
  # respond_to? without include_all ignores private/protected methods.
  return nil unless respond_to?(field)

  public_send(field)
end
973
+
908
974
  private
909
975
 
910
976
  def normalize_config(value, klass)
@@ -11,7 +11,7 @@ module Kreuzberg
11
11
  # rubocop:disable Metrics/ClassLength
12
12
  class Result
13
13
  attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
14
- :detected_languages, :chunks, :images, :pages
14
+ :detected_languages, :chunks, :images, :pages, :elements
15
15
 
16
16
  # @!attribute [r] cells
17
17
  # @return [Array<Array<String>>] Table cells (2D array)
@@ -114,6 +114,68 @@ module Kreuzberg
114
114
  end
115
115
  end
116
116
 
117
# Bounding box of an element, built with keyword arguments.
#
# @!attribute [r] x0
#   @return [Float] Left x-coordinate
# @!attribute [r] y0
#   @return [Float] Bottom y-coordinate
# @!attribute [r] x1
#   @return [Float] Right x-coordinate
# @!attribute [r] y1
#   @return [Float] Top y-coordinate
ElementBoundingBox = Struct.new(:x0, :y0, :x1, :y1, keyword_init: true) do
  # @return [Hash{Symbol => Object}] the four coordinates keyed by member name
  def to_h
    members.zip(values).to_h
  end
end
130
+
131
# Metadata attached to a single extracted element.
#
# @!attribute [r] page_number
#   @return [Integer, nil] Page number (1-indexed)
# @!attribute [r] filename
#   @return [String, nil] Source filename or document name
# @!attribute [r] coordinates
#   @return [ElementBoundingBox, nil] Bounding box coordinates if available
# @!attribute [r] element_index
#   @return [Integer, nil] Position index in the element sequence
# @!attribute [r] additional
#   @return [Hash<String, String>] Additional custom metadata
ElementMetadataStruct = Struct.new(
  :page_number, :filename, :coordinates, :element_index, :additional,
  keyword_init: true
) do
  # @return [Hash{Symbol => Object}] all members, with +coordinates+
  #   converted through its own +to_h+ when it is not nil
  def to_h
    super().merge(coordinates: coordinates&.to_h)
  end
end
159
+
160
# A semantic element extracted from a document.
#
# @!attribute [r] element_id
#   @return [String] Unique element identifier
# @!attribute [r] element_type
#   @return [String] Semantic type of the element
# @!attribute [r] text
#   @return [String] Text content of the element
# @!attribute [r] metadata
#   @return [ElementMetadataStruct] Metadata about the element
ElementStruct = Struct.new(:element_id, :element_type, :text, :metadata, keyword_init: true) do
  # @return [Hash{Symbol => Object}] all members, with +metadata+
  #   converted through its own +to_h+ when it is not nil
  def to_h
    super().merge(metadata: metadata&.to_h)
  end
end
178
+
117
179
  # Initialize from native hash result
118
180
  #
119
181
  # @param hash [Hash] Hash returned from native extension
@@ -128,6 +190,7 @@ module Kreuzberg
128
190
  @chunks = parse_chunks(get_value(hash, 'chunks'))
129
191
  @images = parse_images(get_value(hash, 'images'))
130
192
  @pages = parse_pages(get_value(hash, 'pages'))
193
+ @elements = parse_elements(get_value(hash, 'elements'))
131
194
  end
132
195
 
133
196
  # Convert to hash
@@ -143,7 +206,8 @@ module Kreuzberg
143
206
  detected_languages: @detected_languages,
144
207
  chunks: serialize_chunks,
145
208
  images: serialize_images,
146
- pages: serialize_pages
209
+ pages: serialize_pages,
210
+ elements: serialize_elements
147
211
  }
148
212
  end
149
213
 
@@ -249,6 +313,10 @@ module Kreuzberg
249
313
  @pages&.map(&:to_h)
250
314
  end
251
315
 
316
# Convert the parsed elements to plain hashes.
#
# @return [Array<Hash>, nil] one hash per element, or nil when no
#   elements were parsed
def serialize_elements
  return nil if @elements.nil?

  @elements.map { |element| element.to_h }
end
319
+
252
320
  def get_value(hash, key, default = nil)
253
321
  hash[key] || hash[key.to_sym] || default
254
322
  end
@@ -329,6 +397,43 @@ module Kreuzberg
329
397
  )
330
398
  end
331
399
  end
400
+
401
# Turn the raw element hashes into element structs.
#
# @param elements_data [Array<Hash>, nil] Raw elements, or nil
# @return [Array, nil] one result of +parse_element+ per raw element,
#   or nil when +elements_data+ is nil
def parse_elements(elements_data)
  elements_data&.map { |raw_element| parse_element(raw_element) }
end
406
+
407
# Build an ElementStruct (with nested ElementMetadataStruct) from one raw
# element hash. All lookups use String keys.
#
# @param element_hash [Hash] Raw element with 'element_id', 'element_type',
#   'text' and an optional 'metadata' hash
# @return [ElementStruct] The parsed element
def parse_element(element_hash)
  # A missing metadata entry is treated as an empty hash.
  raw_meta = element_hash['metadata'] || {}

  ElementStruct.new(
    element_id: element_hash['element_id'],
    element_type: element_hash['element_type'],
    text: element_hash['text'],
    metadata: ElementMetadataStruct.new(
      page_number: raw_meta['page_number'],
      filename: raw_meta['filename'],
      coordinates: parse_element_coordinates(raw_meta['coordinates']),
      element_index: raw_meta['element_index'],
      additional: raw_meta['additional'] || {}
    )
  )
end
426
+
427
# Build an ElementBoundingBox from a raw coordinates hash.
#
# @param coordinates_data [Hash, nil] Hash with String keys 'x0', 'y0',
#   'x1', 'y1', or nil
# @return [ElementBoundingBox, nil] Box with each value converted via
#   +to_f+, or nil when +coordinates_data+ is nil
def parse_element_coordinates(coordinates_data)
  return nil if coordinates_data.nil?

  corners = %w[x0 y0 x1 y1].each_with_object({}) do |name, acc|
    acc[name.to_sym] = coordinates_data[name].to_f
  end

  ElementBoundingBox.new(**corners)
end
332
437
  end
333
438
  # rubocop:enable Metrics/ClassLength
334
439
  end
@@ -3,6 +3,110 @@
3
3
  require 'sorbet-runtime'
4
4
 
5
5
  module Kreuzberg
6
# Semantic element type classification.
#
# Categorizes text content into semantic units for downstream processing.
# Supports the element types commonly found in Unstructured documents.
#
# Element types are plain strings. Known values:
# 'title', 'narrative_text', 'heading', 'list_item', 'table', 'image',
# 'page_break', 'code_block', 'block_quote', 'footer', 'header'.
#
# Sorbet has no literal-string types and +T.any+ only accepts types, so
# the alias is declared as String rather than as a union of literals
# (which fails when the alias is resolved).
#
# @example
#   type = 'title' # any of the known values listed above
#
ElementType = T.type_alias { String }
29
+
30
# Rectangular coordinates locating an element within a page.
#
# All four fields are required Floats.
#
# @example
#   bbox = Kreuzberg::BoundingBox.new(
#     x0: 10.0,
#     y0: 20.0,
#     x1: 100.0,
#     y1: 50.0
#   )
#   puts "Width: #{bbox.x1 - bbox.x0}"
#
class BoundingBox < T::Struct
  extend T::Sig

  # First corner.
  const :x0, Float
  const :y0, Float
  # Second corner.
  const :x1, Float
  const :y1, Float
end
54
+
55
# Contextual information about an extracted element: where it sits in the
# document plus free-form custom fields.
#
# @example
#   metadata = Kreuzberg::ElementMetadata.new(
#     page_number: 1,
#     filename: "document.pdf",
#     coordinates: bbox,
#     element_index: 5,
#     additional: { "style" => "bold" }
#   )
#
class ElementMetadata < T::Struct
  extend T::Sig

  # Optional position information; each may be nil.
  const :page_number, T.nilable(Integer)
  const :filename, T.nilable(String)
  const :coordinates, T.nilable(BoundingBox)
  const :element_index, T.nilable(Integer)
  # Custom metadata; required, String keys and String values.
  const :additional, T::Hash[String, String]
end
82
+
83
# A logical unit of document content with a semantic classification,
# an identifier, and metadata describing its origin and position.
# Compatible with Unstructured.io element format when output_format='element_based'.
#
# @example
#   element = Kreuzberg::Element.new(
#     element_id: "elem-abc123",
#     element_type: "narrative_text",
#     text: "This is the main content.",
#     metadata: metadata
#   )
#   puts "#{element.element_type}: #{element.text}"
#
class Element < T::Struct
  extend T::Sig

  # Identifier and semantic type, both plain Strings.
  const :element_id, String
  const :element_type, String
  # Text content and its associated metadata.
  const :text, String
  const :metadata, ElementMetadata
end
109
+
6
110
  # Header/Heading metadata
7
111
  #
8
112
  # Represents a heading element found in the HTML document
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.0.8'
4
+ VERSION = '4.1.0'
5
5
  end
data/lib/kreuzberg.rb CHANGED
@@ -83,10 +83,6 @@ module Kreuzberg
83
83
  module_function :validate_mime_type
84
84
 
85
85
  module_function :get_extensions_for_mime
86
-
87
- module_function :list_embedding_presets
88
-
89
- module_function :get_embedding_preset
90
86
  end
91
87
 
92
88
  require_relative 'kreuzberg/cache_api'