kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,211 @@
1
+ //! Element generation and list detection utilities.
2
+ //!
3
+ //! This module provides functions for detecting semantic elements in text,
4
+ //! including list items, and generating unique element IDs.
5
+
6
+ use crate::types::{Element, ElementId, ElementMetadata, ElementType};
7
+ use std::collections::HashMap;
8
+
9
+ use super::types::{ListItemMetadata, ListType};
10
+
11
+ /// Detect list items in text with support for multiple formats.
12
+ ///
13
+ /// Identifies bullet points, numbered items, and indented items.
14
+ /// Supports formats like:
15
+ /// - `- bullet item`
16
+ /// - `* bullet item`
17
+ /// - `• bullet item`
18
+ /// - `1. numbered item`
19
+ /// - `a. lettered item`
20
+ /// - Indented items with leading whitespace
21
+ ///
22
+ /// # Arguments
23
+ ///
24
+ /// * `text` - The text to search for list items
25
+ ///
26
+ /// # Returns
27
+ ///
28
+ /// A vector of ListItemMetadata structs describing detected list items
29
+ pub fn detect_list_items(text: &str) -> Vec<ListItemMetadata> {
30
+ let mut items = Vec::new();
31
+ let lines: Vec<&str> = text.lines().collect();
32
+
33
+ let mut current_byte_offset = 0;
34
+
35
+ for line in lines {
36
+ let line_start_offset = current_byte_offset;
37
+ let trimmed = line.trim_start();
38
+ let indent_level = (line.len() - trimmed.len()) / 2; // Estimate indent level
39
+
40
+ // Check for bullet points
41
+ if let Some(stripped) = trimmed.strip_prefix('-')
42
+ && (stripped.starts_with(' ') || stripped.is_empty())
43
+ {
44
+ let byte_end = line_start_offset + line.len();
45
+ items.push(ListItemMetadata {
46
+ list_type: ListType::Bullet,
47
+ byte_start: line_start_offset,
48
+ byte_end,
49
+ indent_level: indent_level as u32,
50
+ });
51
+ current_byte_offset = byte_end + 1; // +1 for newline
52
+ continue;
53
+ }
54
+
55
+ if let Some(stripped) = trimmed.strip_prefix('*')
56
+ && (stripped.starts_with(' ') || stripped.is_empty())
57
+ {
58
+ let byte_end = line_start_offset + line.len();
59
+ items.push(ListItemMetadata {
60
+ list_type: ListType::Bullet,
61
+ byte_start: line_start_offset,
62
+ byte_end,
63
+ indent_level: indent_level as u32,
64
+ });
65
+ current_byte_offset = byte_end + 1;
66
+ continue;
67
+ }
68
+
69
+ if let Some(stripped) = trimmed.strip_prefix('•')
70
+ && (stripped.starts_with(' ') || stripped.is_empty())
71
+ {
72
+ let byte_end = line_start_offset + line.len();
73
+ items.push(ListItemMetadata {
74
+ list_type: ListType::Bullet,
75
+ byte_start: line_start_offset,
76
+ byte_end,
77
+ indent_level: indent_level as u32,
78
+ });
79
+ current_byte_offset = byte_end + 1;
80
+ continue;
81
+ }
82
+
83
+ // Check for numbered lists (e.g., "1.", "2.", etc.)
84
+ if let Some(pos) = trimmed.find('.') {
85
+ let prefix = &trimmed[..pos];
86
+ if prefix.chars().all(|c| c.is_ascii_digit())
87
+ && pos > 0
88
+ && pos < 3
89
+ && trimmed.len() > pos + 1
90
+ && trimmed[pos + 1..].starts_with(' ')
91
+ {
92
+ let byte_end = line_start_offset + line.len();
93
+ items.push(ListItemMetadata {
94
+ list_type: ListType::Numbered,
95
+ byte_start: line_start_offset,
96
+ byte_end,
97
+ indent_level: indent_level as u32,
98
+ });
99
+ current_byte_offset = byte_end + 1;
100
+ continue;
101
+ }
102
+ }
103
+
104
+ // Check for lettered lists (e.g., "a.", "b.", "A.", "B.")
105
+ if let Some(pos) = trimmed.find('.') {
106
+ let prefix = &trimmed[..pos];
107
+ if prefix.len() == 1
108
+ && prefix.chars().all(|c| c.is_alphabetic())
109
+ && pos > 0
110
+ && trimmed.len() > pos + 1
111
+ && trimmed[pos + 1..].starts_with(' ')
112
+ {
113
+ let byte_end = line_start_offset + line.len();
114
+ items.push(ListItemMetadata {
115
+ list_type: ListType::Lettered,
116
+ byte_start: line_start_offset,
117
+ byte_end,
118
+ indent_level: indent_level as u32,
119
+ });
120
+ current_byte_offset = byte_end + 1;
121
+ continue;
122
+ }
123
+ }
124
+
125
+ // Check for indented items (more than 4 spaces)
126
+ if indent_level >= 2 && !trimmed.is_empty() {
127
+ let byte_end = line_start_offset + line.len();
128
+ items.push(ListItemMetadata {
129
+ list_type: ListType::Indented,
130
+ byte_start: line_start_offset,
131
+ byte_end,
132
+ indent_level: indent_level as u32,
133
+ });
134
+ current_byte_offset = byte_end + 1;
135
+ continue;
136
+ }
137
+
138
+ current_byte_offset = line_start_offset + line.len() + 1; // +1 for newline
139
+ }
140
+
141
+ items
142
+ }
143
+
144
+ /// Generate a unique element ID for semantic content.
145
+ ///
146
+ /// Creates a deterministic hash-based ID from the element type, text content,
147
+ /// and page number. Uses a simple wrapping multiplication algorithm for
148
+ /// consistent ID generation without external dependencies.
149
+ ///
150
+ /// # Arguments
151
+ ///
152
+ /// * `text` - The element text content
153
+ /// * `element_type` - The semantic element type
154
+ /// * `page_number` - Optional page number for multi-page documents
155
+ ///
156
+ /// # Returns
157
+ ///
158
+ /// An ElementId suitable for referencing this semantic element
159
+ pub fn generate_element_id(text: &str, element_type: ElementType, page_number: Option<usize>) -> ElementId {
160
+ // Simple deterministic hash using wrapping multiplication
161
+ let type_hash = format!("{:?}", element_type)
162
+ .bytes()
163
+ .fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(b as u64));
164
+
165
+ let text_hash = text
166
+ .bytes()
167
+ .fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(b as u64));
168
+
169
+ let page_hash = page_number
170
+ .unwrap_or(1)
171
+ .to_string()
172
+ .bytes()
173
+ .fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(b as u64));
174
+
175
+ let combined = type_hash
176
+ .wrapping_mul(65599)
177
+ .wrapping_add(text_hash)
178
+ .wrapping_mul(65599)
179
+ .wrapping_add(page_hash);
180
+
181
+ ElementId::new(format!("elem-{:x}", combined)).expect("ElementId creation failed")
182
+ }
183
+
184
+ /// Add paragraphs as NarrativeText elements, splitting on double newlines.
185
+ pub(super) fn add_paragraphs(elements: &mut Vec<Element>, text: &str, page_number: usize, title: &Option<String>) {
186
+ if text.is_empty() {
187
+ return;
188
+ }
189
+
190
+ // Split on double newlines to detect paragraph boundaries
191
+ for paragraph in text.split("\n\n").filter(|p| !p.trim().is_empty()) {
192
+ let para_text = paragraph.trim();
193
+ if para_text.is_empty() {
194
+ continue;
195
+ }
196
+
197
+ let element_id = generate_element_id(para_text, ElementType::NarrativeText, Some(page_number));
198
+ elements.push(Element {
199
+ element_id,
200
+ element_type: ElementType::NarrativeText,
201
+ text: para_text.to_string(),
202
+ metadata: ElementMetadata {
203
+ page_number: Some(page_number),
204
+ filename: title.clone(),
205
+ coordinates: None,
206
+ element_index: Some(elements.len()),
207
+ additional: HashMap::new(),
208
+ },
209
+ });
210
+ }
211
+ }
@@ -0,0 +1,480 @@
1
+ //! Transformation utilities for converting extraction results into semantic elements.
2
+ //!
3
+ //! This module provides post-processing functions to transform raw extraction results
4
+ //! into element-based output format, suitable for downstream processing and analysis.
5
+ //! Key functionality includes:
6
+ //!
7
+ //! - Semantic element generation from text content
8
+ //! - List item detection with support for multiple formats
9
+ //! - PageBreak interleaving with reverse byte-order processing
10
+ //! - Safe bounds checking for text ranges
11
+
12
+ mod content;
13
+ mod elements;
14
+ mod types;
15
+
16
+ // Re-export public API
17
+ pub use elements::{detect_list_items, generate_element_id};
18
+ pub use types::{ListItemMetadata, ListType};
19
+
20
+ use crate::types::{Element, ExtractionResult};
21
+ use content::{
22
+ add_page_break, format_table_as_text, process_content, process_hierarchy, process_images, process_tables,
23
+ };
24
+
25
+ /// Transform an extraction result into semantic elements.
26
+ ///
27
+ /// This function takes a reference to an ExtractionResult and generates
28
+ /// a vector of Element structs representing semantic blocks in the document.
29
+ /// It detects content sections, list items, page breaks, and other structural
30
+ /// elements to create an Unstructured-compatible element-based output.
31
+ ///
32
+ /// Handles:
33
+ /// - PDF hierarchy → Title/Heading elements
34
+ /// - Multi-page documents with correct page numbers
35
+ /// - Table and Image extraction
36
+ /// - PageBreak interleaving
37
+ /// - Bounding box coordinates
38
+ /// - Paragraph detection for NarrativeText
39
+ ///
40
+ /// # Arguments
41
+ ///
42
+ /// * `result` - Reference to the ExtractionResult to transform
43
+ ///
44
+ /// # Returns
45
+ ///
46
+ /// A vector of Elements with proper semantic types and metadata.
47
+ pub fn transform_extraction_result_to_elements(result: &ExtractionResult) -> Vec<Element> {
48
+ let mut elements = Vec::new();
49
+
50
+ // If pages are available, process per-page with hierarchy, tables, images
51
+ if let Some(ref pages) = result.pages {
52
+ for page in pages {
53
+ let page_number = page.page_number;
54
+
55
+ // 1. Process hierarchy blocks (PDF headings)
56
+ if let Some(ref hierarchy) = page.hierarchy {
57
+ process_hierarchy(&mut elements, hierarchy, page_number, &result.metadata.title);
58
+ }
59
+
60
+ // 2. Process tables on this page
61
+ process_tables(&mut elements, &page.tables, page_number, &result.metadata.title);
62
+
63
+ // 3. Process images on this page
64
+ process_images(&mut elements, &page.images, page_number, &result.metadata.title);
65
+
66
+ // 4. Process page content (body text, list items, paragraphs)
67
+ process_content(&mut elements, &page.content, page_number, &result.metadata.title);
68
+
69
+ // 5. Add PageBreak after each page (except the last)
70
+ if page_number < pages.len() {
71
+ add_page_break(&mut elements, page_number, page_number + 1, &result.metadata.title);
72
+ }
73
+ }
74
+ } else {
75
+ // Fallback: No pages, process unified content with page 1
76
+ process_content(&mut elements, &result.content, 1, &result.metadata.title);
77
+
78
+ // Process global tables (if any)
79
+ for table in &result.tables {
80
+ let table_text = format_table_as_text(table);
81
+ let element_id = elements::generate_element_id(&table_text, crate::types::ElementType::Table, Some(1));
82
+ elements.push(Element {
83
+ element_id,
84
+ element_type: crate::types::ElementType::Table,
85
+ text: table_text,
86
+ metadata: crate::types::ElementMetadata {
87
+ page_number: Some(1),
88
+ filename: result.metadata.title.clone(),
89
+ coordinates: None,
90
+ element_index: Some(elements.len()),
91
+ additional: std::collections::HashMap::new(),
92
+ },
93
+ });
94
+ }
95
+
96
+ // Process global images (if any)
97
+ if let Some(ref images) = result.images {
98
+ for image in images {
99
+ let image_text = format!(
100
+ "Image: {} ({}x{})",
101
+ image.format,
102
+ image.width.unwrap_or(0),
103
+ image.height.unwrap_or(0)
104
+ );
105
+ let page_num = image.page_number.unwrap_or(1);
106
+
107
+ let element_id =
108
+ elements::generate_element_id(&image_text, crate::types::ElementType::Image, Some(page_num));
109
+ elements.push(Element {
110
+ element_id,
111
+ element_type: crate::types::ElementType::Image,
112
+ text: image_text,
113
+ metadata: crate::types::ElementMetadata {
114
+ page_number: Some(page_num),
115
+ filename: result.metadata.title.clone(),
116
+ coordinates: None,
117
+ element_index: Some(elements.len()),
118
+ additional: {
119
+ let mut m = std::collections::HashMap::new();
120
+ m.insert("format".to_string(), image.format.clone());
121
+ if let Some(width) = image.width {
122
+ m.insert("width".to_string(), width.to_string());
123
+ }
124
+ if let Some(height) = image.height {
125
+ m.insert("height".to_string(), height.to_string());
126
+ }
127
+ m
128
+ },
129
+ },
130
+ });
131
+ }
132
+ }
133
+ }
134
+
135
+ elements
136
+ }
137
+
138
+ #[cfg(test)]
139
+ mod tests {
140
+ use super::*;
141
+
142
/// Asserts that three consecutive `- ` lines yield three detected items,
/// each classified as `ListType::Bullet`.
#[test]
fn test_detect_bullet_items() {
    let detected = detect_list_items("- First item\n- Second item\n- Third item");

    assert_eq!(detected.len(), 3);
    for entry in detected.iter() {
        assert_eq!(entry.list_type, ListType::Bullet);
    }
}
151
+
152
/// Asserts that `1.`/`2.`/`3.` prefixed lines yield three detected items and
/// that none of them is classified as anything other than `ListType::Numbered`.
#[test]
fn test_detect_numbered_items() {
    let detected = detect_list_items("1. First\n2. Second\n3. Third");

    assert_eq!(detected.len(), 3);
    assert!(!detected.iter().any(|entry| entry.list_type != ListType::Numbered));
}
159
+
160
/// Asserts that `a.`/`b.`/`c.` prefixed lines yield three detected items,
/// each classified as `ListType::Lettered`.
#[test]
fn test_detect_lettered_items() {
    let detected = detect_list_items("a. First\nb. Second\nc. Third");

    assert_eq!(detected.len(), 3);
    for entry in detected.iter() {
        assert!(entry.list_type == ListType::Lettered);
    }
}
167
+
168
/// Asserts that, in text mixing plain lines with list lines, only the two
/// list lines are reported: a bullet first, then a numbered item.
#[test]
fn test_detect_mixed_items() {
    let detected = detect_list_items("Some text\n- Bullet\n1. Numbered\nMore text");

    assert_eq!(detected.len(), 2);

    // Walk the detected kinds in order instead of indexing each slot.
    let mut kinds = detected.iter().map(|entry| &entry.list_type);
    assert_eq!(kinds.next(), Some(&ListType::Bullet));
    assert_eq!(kinds.next(), Some(&ListType::Numbered));
}
176
+
177
/// Asserts that `generate_element_id` gives equal ids for identical
/// (text, type, page) inputs and a different id when only the text changes.
#[test]
fn test_element_id_generation() {
    use crate::types::ElementType;

    let baseline = generate_element_id("test", ElementType::Title, Some(1));
    let repeated = generate_element_id("test", ElementType::Title, Some(1));
    let other_text = generate_element_id("different", ElementType::Title, Some(1));

    assert_eq!(baseline.as_ref(), repeated.as_ref());
    assert_ne!(baseline.as_ref(), other_text.as_ref());
}
187
+
188
/// Checks the ordering rule for page breaks: entries sorted by byte offset,
/// largest first, with each label staying attached to its offset.
///
/// NOTE(review): this test only sorts local sample data; it does not call the
/// production interleaving code, so it documents the intended ordering rather
/// than verifying the implementation.
#[test]
fn test_page_break_interleaving_reverse_order() {
    // Deliberately unsorted so the sort has real work to do. A fixed-size
    // array sorted in place avoids the heap allocation and the extra clone.
    let mut page_breaks = [(100, "page_break_1"), (50, "page_break_2"), (75, "page_break_3")];

    // Descending by byte offset.
    page_breaks.sort_by_key(|&(offset, _)| std::cmp::Reverse(offset));

    // Compare whole tuples so a sort that scrambled the offset/label pairing
    // would be caught, not just one that misordered the offsets.
    assert_eq!(
        page_breaks,
        [(100, "page_break_1"), (75, "page_break_3"), (50, "page_break_2")]
    );
}
202
+
203
/// Illustrates the byte-range invariants for a `ListItemMetadata` against an
/// 11-byte string: start and end within the text, start not after end, and an
/// item whose end lies past the text.
///
/// NOTE(review): only hand-built values are compared; no detection code runs.
#[test]
fn test_bounds_checking() {
    let text_len = "Hello world".len();

    // Range 0..5 lies entirely inside the text.
    let within = ListItemMetadata {
        list_type: ListType::Bullet,
        byte_start: 0,
        byte_end: 5,
        indent_level: 0,
    };
    assert!(within.byte_start <= text_len);
    assert!(within.byte_end <= text_len);
    assert!(within.byte_start <= within.byte_end);

    // Range 0..100 runs past the end of the text.
    let overrun = ListItemMetadata {
        list_type: ListType::Bullet,
        byte_start: 0,
        byte_end: 100,
        indent_level: 0,
    };
    assert!(overrun.byte_end > text_len);
}
227
+
228
/// Asserts that a bullet line preceded by whitespace is detected as a single
/// item with an indent level of at least 1.
///
/// NOTE(review): the literal is reproduced exactly as it appears in this view
/// (one leading space); confirm against the original file in case the
/// rendering collapsed a longer run of spaces.
#[test]
fn test_indent_level_detection() {
    let detected = detect_list_items(" - Indented item");

    assert_eq!(detected.len(), 1);
    let only_item = &detected[0];
    assert!(only_item.indent_level >= 1);
}
235
+
236
+ // Helper to create minimal Metadata for tests
237
+ fn test_metadata(title: Option<String>) -> crate::types::Metadata {
238
+ crate::types::Metadata {
239
+ title,
240
+ subject: None,
241
+ authors: None,
242
+ keywords: None,
243
+ language: None,
244
+ created_at: None,
245
+ modified_at: None,
246
+ created_by: None,
247
+ modified_by: None,
248
+ pages: None,
249
+ format: None,
250
+ image_preprocessing: None,
251
+ json_schema: None,
252
+ error: None,
253
+ additional: Default::default(),
254
+ }
255
+ }
256
+
257
// Integration tests for full transformation

/// Transforms a two-page result and checks the produced elements: page 1
/// carries a hierarchy with an `h1` and an `h2` block, page 2 carries a
/// two-item bullet list.
///
/// Expected: two `Title` elements on page 1 with coordinates, two `ListItem`
/// elements on page 2, and exactly one `PageBreak`.
#[test]
fn test_transform_with_pages_and_hierarchy() {
    use crate::types::{ElementType, ExtractionResult, HierarchicalBlock, PageContent, PageHierarchy};

    // Page 1: body paragraphs plus two heading blocks with bounding boxes.
    let heading_blocks = vec![
        HierarchicalBlock {
            text: "Main Title".to_string(),
            font_size: 24.0,
            level: "h1".to_string(),
            bbox: Some((10.0, 20.0, 100.0, 50.0)),
        },
        HierarchicalBlock {
            text: "Subtitle".to_string(),
            font_size: 16.0,
            level: "h2".to_string(),
            bbox: Some((10.0, 60.0, 100.0, 80.0)),
        },
    ];
    let first_page = PageContent {
        page_number: 1,
        content: "This is a test paragraph.\n\nAnother paragraph here.".to_string(),
        tables: vec![],
        images: vec![],
        hierarchy: Some(PageHierarchy {
            block_count: 2,
            blocks: heading_blocks,
        }),
    };

    // Page 2: a bullet list and no hierarchy information.
    let second_page = PageContent {
        page_number: 2,
        content: "- List item 1\n- List item 2".to_string(),
        tables: vec![],
        images: vec![],
        hierarchy: None,
    };

    let result = ExtractionResult {
        content: "Full document content".to_string(),
        mime_type: "application/pdf".to_string(),
        metadata: test_metadata(Some("Test Document".to_string())),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: Some(vec![first_page, second_page]),
        elements: None,
    };

    let elements = transform_extraction_result_to_elements(&result);
    assert!(!elements.is_empty());

    // Collects references to every element of the requested type, in order.
    let of_kind = |kind: ElementType| -> Vec<_> {
        elements
            .iter()
            .filter(|element| element.element_type == kind)
            .collect()
    };

    // Titles come from the hierarchy blocks of page 1.
    let titles = of_kind(ElementType::Title);
    assert_eq!(titles.len(), 2, "Should have 2 title elements from hierarchy");
    assert_eq!(titles[0].text, "Main Title");
    assert_eq!(titles[1].text, "Subtitle");
    for title in titles.iter() {
        assert_eq!(title.metadata.page_number, Some(1));
        assert!(title.metadata.coordinates.is_some());
    }

    // List items come from the bullet list on page 2.
    let list_items = of_kind(ElementType::ListItem);
    assert_eq!(list_items.len(), 2, "Should have 2 list items");
    assert!(list_items.iter().all(|item| item.metadata.page_number == Some(2)));

    // Two pages, one break.
    let page_breaks = of_kind(ElementType::PageBreak);
    assert_eq!(page_breaks.len(), 1, "Should have 1 page break between pages");
}
345
+
346
/// Transforms a single-page result whose page holds one table and one image,
/// and checks that each appears as exactly one element with the expected
/// text fragments (and, for the image, page number 1).
#[test]
fn test_transform_with_tables_and_images() {
    use crate::types::{ElementType, ExtractedImage, ExtractionResult, PageContent, Table};
    use std::sync::Arc;

    // A 2x2 table attached to page 1.
    let header_row = vec!["Header1".to_string(), "Header2".to_string()];
    let body_row = vec!["Cell1".to_string(), "Cell2".to_string()];
    let page_table = Table {
        cells: vec![header_row, body_row],
        markdown: "| Header1 | Header2 |\n| Cell1 | Cell2 |".to_string(),
        page_number: 1,
    };

    // A 640x480 JPEG attached to page 1.
    let page_image = ExtractedImage {
        data: vec![1, 2, 3, 4],
        format: "jpeg".to_string(),
        image_index: 0,
        page_number: Some(1),
        width: Some(640),
        height: Some(480),
        colorspace: Some("RGB".to_string()),
        bits_per_component: Some(8),
        is_mask: false,
        description: None,
        ocr_result: None,
    };

    let only_page = PageContent {
        page_number: 1,
        content: "Some text".to_string(),
        tables: vec![Arc::new(page_table)],
        images: vec![Arc::new(page_image)],
        hierarchy: None,
    };

    let result = ExtractionResult {
        content: "Test content".to_string(),
        mime_type: "application/pdf".to_string(),
        metadata: test_metadata(Some("Test".to_string())),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: Some(vec![only_page]),
        elements: None,
    };

    let elements = transform_extraction_result_to_elements(&result);

    // Table element: its text must mention the first header and the last cell.
    let table_elements: Vec<_> = elements
        .iter()
        .filter(|element| element.element_type == ElementType::Table)
        .collect();
    assert_eq!(table_elements.len(), 1, "Should have 1 table element");
    let table_text = &table_elements[0].text;
    assert!(table_text.contains("Header1"));
    assert!(table_text.contains("Cell2"));

    // Image element: its text must mention the format and both dimensions.
    let image_elements: Vec<_> = elements
        .iter()
        .filter(|element| element.element_type == ElementType::Image)
        .collect();
    assert_eq!(image_elements.len(), 1, "Should have 1 image element");
    let image_element = image_elements[0];
    for fragment in ["jpeg", "640", "480"] {
        assert!(image_element.text.contains(fragment));
    }
    assert_eq!(image_element.metadata.page_number, Some(1));
}
416
+
417
/// Transforms a result that has no per-page data and checks that at least
/// one `NarrativeText` element is produced and that every element reports
/// page number 1.
#[test]
fn test_transform_fallback_no_pages() {
    use crate::types::{ElementType, ExtractionResult};

    // `pages: None` is the condition under test.
    let result = ExtractionResult {
        content: "Simple text content\n\nSecond paragraph".to_string(),
        mime_type: "text/plain".to_string(),
        metadata: test_metadata(Some("Simple Doc".to_string())),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let elements = transform_extraction_result_to_elements(&result);

    let has_narrative = elements
        .iter()
        .any(|element| element.element_type == ElementType::NarrativeText);
    assert!(has_narrative, "Should have narrative text elements");

    // Every element must carry the fallback page number.
    assert!(elements.iter().all(|element| element.metadata.page_number == Some(1)));
}
449
+
450
/// Asserts that content with three blank-line-separated paragraphs becomes
/// three `NarrativeText` elements whose text matches each paragraph, in order.
#[test]
fn test_paragraph_splitting() {
    use crate::types::{ElementType, ExtractionResult};

    let result = ExtractionResult {
        content: "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.".to_string(),
        mime_type: "text/plain".to_string(),
        metadata: test_metadata(None),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let elements = transform_extraction_result_to_elements(&result);

    // Keep only the text of narrative elements, preserving output order.
    let paragraph_texts: Vec<&str> = elements
        .iter()
        .filter(|element| element.element_type == ElementType::NarrativeText)
        .map(|element| element.text.as_str())
        .collect();

    assert_eq!(paragraph_texts.len(), 3, "Should split into 3 paragraphs");
    assert_eq!(
        paragraph_texts,
        ["First paragraph.", "Second paragraph.", "Third paragraph."]
    );
}
480
+ }