kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,560 @@
1
+ //! Metadata types for extraction results.
2
+ //!
3
+ //! This module defines metadata structures for various document formats.
4
+
5
+ use serde::{Deserialize, Serialize};
6
+ use std::collections::{BTreeMap, HashMap};
7
+
8
+ #[cfg(feature = "pdf")]
9
+ use crate::pdf::metadata::PdfMetadata;
10
+
11
+ use super::formats::ImagePreprocessingMetadata;
12
+ use super::page::PageStructure;
13
+
14
+ /// Format-specific metadata (discriminated union, internally tagged).
15
+ ///
16
+ /// Only one format type can exist per extraction result. This provides
17
+ /// type-safe, clean metadata without nested optionals. Serde tags the active variant with a `format_type` field (snake_case variant names); `Pdf` is only compiled when the `pdf` feature is enabled.
18
+ #[derive(Debug, Clone, Serialize, Deserialize)]
19
+ #[serde(tag = "format_type", rename_all = "snake_case")]
20
+ pub enum FormatMetadata {
21
+ #[cfg(feature = "pdf")]
22
+ Pdf(PdfMetadata),
23
+ Excel(ExcelMetadata),
24
+ Email(EmailMetadata),
25
+ Pptx(PptxMetadata),
26
+ Archive(ArchiveMetadata),
27
+ Image(ImageMetadata),
28
+ Xml(XmlMetadata),
29
+ Text(TextMetadata),
30
+ Html(Box<HtmlMetadata>),
31
+ Ocr(OcrMetadata),
32
+ }
33
+
34
+ /// Extraction result metadata.
35
+ ///
36
+ /// Contains common fields applicable to all formats, format-specific metadata
37
+ /// via a discriminated union, and additional custom fields from postprocessors. Every `Option` field is omitted from serialized output when it is `None`.
38
+ #[derive(Debug, Clone, Serialize, Deserialize, Default)]
39
+ pub struct Metadata {
40
+ /// Document title
41
+ #[serde(skip_serializing_if = "Option::is_none")]
42
+ pub title: Option<String>,
43
+
44
+ /// Document subject or description
45
+ #[serde(skip_serializing_if = "Option::is_none")]
46
+ pub subject: Option<String>,
47
+
48
+ /// Primary author(s) - always a `Vec`, even for a single author, for consistency
49
+ #[serde(skip_serializing_if = "Option::is_none")]
50
+ pub authors: Option<Vec<String>>,
51
+
52
+ /// Keywords/tags - always a `Vec`, even for a single keyword, for consistency
53
+ #[serde(skip_serializing_if = "Option::is_none")]
54
+ pub keywords: Option<Vec<String>>,
55
+
56
+ /// Primary language (ISO 639 code)
57
+ #[serde(skip_serializing_if = "Option::is_none")]
58
+ pub language: Option<String>,
59
+
60
+ /// Creation timestamp (ISO 8601 format, stored as a plain string)
61
+ #[serde(skip_serializing_if = "Option::is_none")]
62
+ pub created_at: Option<String>,
63
+
64
+ /// Last modification timestamp (ISO 8601 format, stored as a plain string)
65
+ #[serde(skip_serializing_if = "Option::is_none")]
66
+ pub modified_at: Option<String>,
67
+
68
+ /// User who created the document
69
+ #[serde(skip_serializing_if = "Option::is_none")]
70
+ pub created_by: Option<String>,
71
+
72
+ /// User who last modified the document
73
+ #[serde(skip_serializing_if = "Option::is_none")]
74
+ pub modified_by: Option<String>,
75
+
76
+ /// Page/slide/sheet structure with boundaries
77
+ #[serde(skip_serializing_if = "Option::is_none")]
78
+ pub pages: Option<PageStructure>,
79
+
80
+ /// Format-specific metadata (discriminated union)
81
+ ///
82
+ /// Contains detailed metadata specific to the document format.
83
+ /// Flattened into this struct, so the variant's fields and the `format_type` discriminator appear at the root level when serialized.
84
+ #[serde(flatten, skip_serializing_if = "Option::is_none")]
85
+ pub format: Option<FormatMetadata>,
86
+
87
+ /// Image preprocessing metadata (when OCR preprocessing was applied)
88
+ #[serde(skip_serializing_if = "Option::is_none")]
89
+ pub image_preprocessing: Option<ImagePreprocessingMetadata>,
90
+
91
+ /// JSON schema (for structured data extraction)
92
+ #[serde(skip_serializing_if = "Option::is_none")]
93
+ pub json_schema: Option<serde_json::Value>,
94
+
95
+ /// Error metadata (for batch operations)
96
+ #[serde(skip_serializing_if = "Option::is_none")]
97
+ pub error: Option<ErrorMetadata>,
98
+
99
+ /// Additional custom fields from postprocessors.
100
+ ///
101
+ /// This flattened HashMap allows Python/TypeScript postprocessors to add
102
+ /// arbitrary fields (entity extraction, keyword extraction, etc.).
103
+ /// Fields are merged at the root level during serialization. NOTE(review): keys that collide with the named fields above would presumably be emitted twice — confirm how callers avoid this.
104
+ #[serde(flatten)]
105
+ pub additional: HashMap<String, serde_json::Value>,
106
+ }
107
+
108
+ /// Excel/spreadsheet metadata.
109
+ ///
110
+ /// Contains information about sheets in Excel, LibreOffice Calc, and other
111
+ /// spreadsheet formats (.xlsx, .xls, .ods, etc.). Both fields are required when deserializing (neither has a serde default).
112
+ #[derive(Debug, Clone, Serialize, Deserialize)]
113
+ pub struct ExcelMetadata {
114
+ /// Total number of sheets in the workbook
115
+ pub sheet_count: usize,
116
+ /// Names of all sheets in order. NOTE(review): presumably workbook order with `sheet_count == sheet_names.len()` — confirm against the extractor.
117
+ pub sheet_names: Vec<String>,
118
+ }
119
+
120
+ /// Email metadata extracted from .eml and .msg files.
121
+ ///
122
+ /// Includes sender/recipient information, message ID, and attachment list. The list fields are always serialized (as empty sequences when there are no entries), whereas the `Option` fields are omitted when `None`.
123
+ #[derive(Debug, Clone, Serialize, Deserialize)]
124
+ pub struct EmailMetadata {
125
+ /// Sender's email address
126
+ #[serde(skip_serializing_if = "Option::is_none")]
127
+ pub from_email: Option<String>,
128
+
129
+ /// Sender's display name
130
+ #[serde(skip_serializing_if = "Option::is_none")]
131
+ pub from_name: Option<String>,
132
+
133
+ /// Primary (To) recipients
134
+ pub to_emails: Vec<String>,
135
+ /// Carbon-copy (CC) recipients
136
+ pub cc_emails: Vec<String>,
137
+ /// Blind-carbon-copy (BCC) recipients
138
+ pub bcc_emails: Vec<String>,
139
+
140
+ /// Message-ID header value
141
+ #[serde(skip_serializing_if = "Option::is_none")]
142
+ pub message_id: Option<String>,
143
+
144
+ /// List of attachment filenames
145
+ pub attachments: Vec<String>,
146
+ }
147
+
148
+ /// Archive (ZIP/TAR/7Z) metadata.
149
+ ///
150
+ /// Extracted from compressed archive files; holds the archive's file list and size information.
151
+ #[derive(Debug, Clone, Serialize, Deserialize)]
152
+ pub struct ArchiveMetadata {
153
+ /// Archive format ("ZIP", "TAR", "7Z", etc.)
154
+ pub format: String,
155
+ /// Total number of files in the archive
156
+ pub file_count: usize,
157
+ /// List of file paths within the archive
158
+ pub file_list: Vec<String>,
159
+ /// Total uncompressed size in bytes
160
+ pub total_size: usize,
161
+
162
+ /// Compressed size in bytes (if available); omitted from serialized output when `None`
163
+ #[serde(skip_serializing_if = "Option::is_none")]
164
+ pub compressed_size: Option<usize>,
165
+ }
166
+
167
+ /// Image metadata extracted from image files.
168
+ ///
169
+ /// Includes dimensions, format, and EXIF data. Distinct from `ImageMetadataType`, which describes an image element (src/alt/title).
170
+ #[derive(Debug, Clone, Serialize, Deserialize)]
171
+ pub struct ImageMetadata {
172
+ /// Image width in pixels
173
+ pub width: u32,
174
+ /// Image height in pixels
175
+ pub height: u32,
176
+ /// Image format (e.g., "PNG", "JPEG", "TIFF")
177
+ pub format: String,
178
+ /// EXIF metadata tags as string key/value pairs; entry order is unspecified because this is a `HashMap`
179
+ pub exif: HashMap<String, String>,
180
+ }
181
+
182
+ /// XML metadata extracted during XML parsing.
183
+ ///
184
+ /// Provides statistics about XML document structure.
185
+ #[derive(Debug, Clone, Serialize, Deserialize)]
186
+ pub struct XmlMetadata {
187
+ /// Total number of XML elements processed
188
+ pub element_count: usize,
189
+ /// List of unique element tag names (sorted). NOTE(review): uniqueness and ordering are not enforced by this type — the producer must guarantee them.
190
+ pub unique_elements: Vec<String>,
191
+ }
192
+
193
+ /// Text/Markdown metadata.
194
+ ///
195
+ /// Extracted from plain text and Markdown files. Includes line, word, and character counts and,
196
+ /// for Markdown, structural elements like headers, links, and code blocks.
197
+ #[derive(Debug, Clone, Serialize, Deserialize)]
198
+ pub struct TextMetadata {
199
+ /// Number of lines in the document
200
+ pub line_count: usize,
201
+ /// Number of words
202
+ pub word_count: usize,
203
+ /// Number of characters. NOTE(review): unclear from this type whether bytes, Unicode scalar values, or graphemes are counted — confirm against the extractor.
204
+ pub character_count: usize,
205
+
206
+ /// Markdown headers (heading text only, for Markdown files)
207
+ #[serde(skip_serializing_if = "Option::is_none")]
208
+ pub headers: Option<Vec<String>>,
209
+
210
+ /// Markdown links as (text, url) tuples (for Markdown files); each tuple serializes as a two-element sequence
211
+ #[serde(skip_serializing_if = "Option::is_none")]
212
+ pub links: Option<Vec<(String, String)>>,
213
+
214
+ /// Code blocks as (language, code) tuples (for Markdown files); each tuple serializes as a two-element sequence
215
+ #[serde(skip_serializing_if = "Option::is_none")]
216
+ pub code_blocks: Option<Vec<(String, String)>>,
217
+ }
218
+
219
+ /// Text direction enumeration for HTML documents. Each variant carries an explicit serde name (`ltr`, `rtl`, `auto`), which takes precedence over the container-level `rename_all` rule.
220
+ #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
221
+ #[serde(rename_all = "lowercase")]
222
+ pub enum TextDirection {
223
+ /// Left-to-right text direction (serialized as `ltr`)
224
+ #[serde(rename = "ltr")]
225
+ LeftToRight,
226
+ /// Right-to-left text direction (serialized as `rtl`)
227
+ #[serde(rename = "rtl")]
228
+ RightToLeft,
229
+ /// Automatic text direction detection (serialized as `auto`)
230
+ #[serde(rename = "auto")]
231
+ Auto,
232
+ }
233
+
234
+ /// Header/heading element metadata.
235
+ #[derive(Debug, Clone, Serialize, Deserialize)]
236
+ pub struct HeaderMetadata {
237
+ /// Header level: 1 (h1) through 6 (h6). NOTE(review): the 1-6 range is not enforced by the `u8` type — producers must uphold it.
238
+ pub level: u8,
239
+ /// Normalized text content of the header
240
+ pub text: String,
241
+ /// HTML id attribute if present; omitted from serialized output when `None`
242
+ #[serde(skip_serializing_if = "Option::is_none")]
243
+ pub id: Option<String>,
244
+ /// Document tree depth at the header element
245
+ pub depth: usize,
246
+ /// Byte offset in original HTML document
247
+ pub html_offset: usize,
248
+ }
249
+
250
+ /// Link element metadata.
251
+ #[derive(Debug, Clone, Serialize, Deserialize)]
252
+ pub struct LinkMetadata {
253
+ /// The href URL value
254
+ pub href: String,
255
+ /// Link text content (normalized)
256
+ pub text: String,
257
+ /// Optional title attribute; omitted from serialized output when `None`
258
+ #[serde(skip_serializing_if = "Option::is_none")]
259
+ pub title: Option<String>,
260
+ /// Link type classification (see `LinkType`: anchor, internal, external, email, phone, other)
261
+ pub link_type: LinkType,
262
+ /// Rel attribute values. NOTE(review): presumably the `rel` attribute split into individual tokens — confirm against the HTML extractor.
263
+ pub rel: Vec<String>,
264
+ /// Additional attributes as key-value pairs (entry order is unspecified because this is a `HashMap`)
265
+ pub attributes: HashMap<String, String>,
266
+ }
267
+
268
+ /// Link type classification. Variants serialize as lowercase strings (e.g. `anchor`, `external`).
269
+ #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
270
+ #[serde(rename_all = "lowercase")]
271
+ pub enum LinkType {
272
+ /// Anchor link (#section)
273
+ Anchor,
274
+ /// Internal link (same domain)
275
+ Internal,
276
+ /// External link (different domain)
277
+ External,
278
+ /// Email link (mailto:)
279
+ Email,
280
+ /// Phone link (tel:)
281
+ Phone,
282
+ /// Any other link type
283
+ Other,
284
+ }
285
+
286
+ /// Image element metadata (distinct from `ImageMetadata`, which carries pixel dimensions, format, and EXIF data).
287
+ #[derive(Debug, Clone, Serialize, Deserialize)]
288
+ pub struct ImageMetadataType {
289
+ /// Image source (URL, data URI, or SVG content)
290
+ pub src: String,
291
+ /// Alternative text from alt attribute
292
+ #[serde(skip_serializing_if = "Option::is_none")]
293
+ pub alt: Option<String>,
294
+ /// Title attribute
295
+ #[serde(skip_serializing_if = "Option::is_none")]
296
+ pub title: Option<String>,
297
+ /// Image dimensions as (width, height) if available. Unlike `alt` and `title`, this field has no `skip_serializing_if`, so `None` is serialized explicitly (e.g. `null` in JSON). NOTE(review): confirm this asymmetry is intentional.
298
+ pub dimensions: Option<(u32, u32)>,
299
+ /// Image type classification
300
+ pub image_type: ImageType,
301
+ /// Additional attributes as key-value pairs
302
+ pub attributes: HashMap<String, String>,
303
+ }
304
+
305
+ /// Image type classification. Serialized as `data-uri`, `inline-svg`, `external`, or `relative`.
306
+ #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
307
+ #[serde(rename_all = "lowercase")]
308
+ pub enum ImageType {
309
+ /// Data URI image (explicit name `data-uri` overrides the lowercase rule)
310
+ #[serde(rename = "data-uri")]
311
+ DataUri,
312
+ /// Inline SVG (explicit name `inline-svg` overrides the lowercase rule)
313
+ #[serde(rename = "inline-svg")]
314
+ InlineSvg,
315
+ /// External image URL
316
+ External,
317
+ /// Relative path image
318
+ Relative,
319
+ }
320
+
321
+ /// Structured data block (e.g. Schema.org content carried as JSON-LD, microdata, or RDFa).
322
+ #[derive(Debug, Clone, Serialize, Deserialize)]
323
+ pub struct StructuredData {
324
+ /// Type of structured data
325
+ pub data_type: StructuredDataType,
326
+ /// Raw JSON string representation, kept as an unparsed `String`. NOTE(review): this type does not validate that the content is well-formed JSON.
327
+ pub raw_json: String,
328
+ /// Schema type if detectable (e.g., "Article", "Event", "Product")
329
+ #[serde(skip_serializing_if = "Option::is_none")]
330
+ pub schema_type: Option<String>,
331
+ }
332
+
333
+ /// Structured data type classification. Serialized as `json-ld`, `microdata`, or `rdfa`.
334
+ #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
335
+ #[serde(rename_all = "lowercase")]
336
+ pub enum StructuredDataType {
337
+ /// JSON-LD structured data (serialized as `json-ld`)
338
+ #[serde(rename = "json-ld")]
339
+ JsonLd,
340
+ /// Microdata (serialized as `microdata`)
341
+ Microdata,
342
+ /// RDFa (serialized as `rdfa`)
343
+ #[serde(rename = "rdfa")]
344
+ RDFa,
345
+ }
346
+
347
/// HTML metadata extracted from HTML documents.
///
/// Includes document-level metadata, Open Graph data, Twitter Card metadata,
/// and extracted structural elements (headers, links, images, structured data).
///
/// Serialization notes: every `Option` field is left out of the output when it
/// is `None`, and every collection field falls back to its empty default when
/// the key is missing from the input.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct HtmlMetadata {
    /// Document title from `<title>` tag
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,

    /// Document description from `<meta name="description">` tag
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,

    /// Document keywords from `<meta name="keywords">` tag, split on commas
    #[serde(default)]
    pub keywords: Vec<String>,

    /// Document author from `<meta name="author">` tag
    #[serde(skip_serializing_if = "Option::is_none")]
    pub author: Option<String>,

    /// Canonical URL from `<link rel="canonical">` tag
    #[serde(skip_serializing_if = "Option::is_none")]
    pub canonical_url: Option<String>,

    /// Base URL from `<base href="">` tag for resolving relative URLs
    #[serde(skip_serializing_if = "Option::is_none")]
    pub base_href: Option<String>,

    /// Document language from `lang` attribute
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,

    /// Document text direction from `dir` attribute
    #[serde(skip_serializing_if = "Option::is_none")]
    pub text_direction: Option<TextDirection>,

    /// Open Graph metadata (og:* properties) for social media
    /// Keys like "title", "description", "image", "url", etc.
    #[serde(default)]
    pub open_graph: BTreeMap<String, String>,

    /// Twitter Card metadata (twitter:* properties)
    /// Keys like "card", "site", "creator", "title", "description", "image", etc.
    #[serde(default)]
    pub twitter_card: BTreeMap<String, String>,

    /// Additional meta tags not covered by specific fields
    /// Keys are meta name/property attributes, values are content
    #[serde(default)]
    pub meta_tags: BTreeMap<String, String>,

    /// Extracted header elements with hierarchy
    #[serde(default)]
    pub headers: Vec<HeaderMetadata>,

    /// Extracted hyperlinks with type classification
    #[serde(default)]
    pub links: Vec<LinkMetadata>,

    /// Extracted images with source and dimensions
    #[serde(default)]
    pub images: Vec<ImageMetadataType>,

    /// Extracted structured data blocks
    #[serde(default)]
    pub structured_data: Vec<StructuredData>,
}
416
+
417
+ impl HtmlMetadata {
418
+ /// Check if metadata is empty (no meaningful content extracted).
419
+ pub fn is_empty(&self) -> bool {
420
+ self.title.is_none()
421
+ && self.description.is_none()
422
+ && self.keywords.is_empty()
423
+ && self.author.is_none()
424
+ && self.canonical_url.is_none()
425
+ && self.base_href.is_none()
426
+ && self.language.is_none()
427
+ && self.text_direction.is_none()
428
+ && self.open_graph.is_empty()
429
+ && self.twitter_card.is_empty()
430
+ && self.meta_tags.is_empty()
431
+ && self.headers.is_empty()
432
+ && self.links.is_empty()
433
+ && self.images.is_empty()
434
+ && self.structured_data.is_empty()
435
+ }
436
+ }
437
+
438
#[cfg(feature = "html")]
impl From<html_to_markdown_rs::ExtendedMetadata> for HtmlMetadata {
    /// Converts the metadata produced by `html_to_markdown_rs` into this
    /// crate's own [`HtmlMetadata`].
    ///
    /// Document-level fields are moved across unchanged. Headers, links,
    /// images and structured data entries are rebuilt one by one, and each
    /// upstream enum is mapped onto the local variant of the same name.
    fn from(metadata: html_to_markdown_rs::ExtendedMetadata) -> Self {
        // Variant-for-variant enum translations, kept exhaustive so that a new
        // upstream variant surfaces as a compile error.
        fn convert_direction(direction: html_to_markdown_rs::TextDirection) -> TextDirection {
            match direction {
                html_to_markdown_rs::TextDirection::LeftToRight => TextDirection::LeftToRight,
                html_to_markdown_rs::TextDirection::RightToLeft => TextDirection::RightToLeft,
                html_to_markdown_rs::TextDirection::Auto => TextDirection::Auto,
            }
        }

        fn convert_link_type(kind: html_to_markdown_rs::LinkType) -> LinkType {
            match kind {
                html_to_markdown_rs::LinkType::Anchor => LinkType::Anchor,
                html_to_markdown_rs::LinkType::Internal => LinkType::Internal,
                html_to_markdown_rs::LinkType::External => LinkType::External,
                html_to_markdown_rs::LinkType::Email => LinkType::Email,
                html_to_markdown_rs::LinkType::Phone => LinkType::Phone,
                html_to_markdown_rs::LinkType::Other => LinkType::Other,
            }
        }

        fn convert_image_type(kind: html_to_markdown_rs::ImageType) -> ImageType {
            match kind {
                html_to_markdown_rs::ImageType::DataUri => ImageType::DataUri,
                html_to_markdown_rs::ImageType::InlineSvg => ImageType::InlineSvg,
                html_to_markdown_rs::ImageType::External => ImageType::External,
                html_to_markdown_rs::ImageType::Relative => ImageType::Relative,
            }
        }

        fn convert_data_type(kind: html_to_markdown_rs::StructuredDataType) -> StructuredDataType {
            match kind {
                html_to_markdown_rs::StructuredDataType::JsonLd => StructuredDataType::JsonLd,
                html_to_markdown_rs::StructuredDataType::Microdata => StructuredDataType::Microdata,
                html_to_markdown_rs::StructuredDataType::RDFa => StructuredDataType::RDFa,
            }
        }

        let document = metadata.document;

        let headers: Vec<HeaderMetadata> = metadata
            .headers
            .into_iter()
            .map(|header| HeaderMetadata {
                level: header.level,
                text: header.text,
                id: header.id,
                depth: header.depth,
                html_offset: header.html_offset,
            })
            .collect();

        let links: Vec<LinkMetadata> = metadata
            .links
            .into_iter()
            .map(|link| LinkMetadata {
                href: link.href,
                text: link.text,
                title: link.title,
                link_type: convert_link_type(link.link_type),
                rel: link.rel,
                attributes: link.attributes.into_iter().collect(),
            })
            .collect();

        let images: Vec<ImageMetadataType> = metadata
            .images
            .into_iter()
            .map(|image| ImageMetadataType {
                src: image.src,
                alt: image.alt,
                title: image.title,
                dimensions: image.dimensions,
                image_type: convert_image_type(image.image_type),
                attributes: image.attributes.into_iter().collect(),
            })
            .collect();

        let structured_data: Vec<StructuredData> = metadata
            .structured_data
            .into_iter()
            .map(|block| StructuredData {
                data_type: convert_data_type(block.data_type),
                raw_json: block.raw_json,
                schema_type: block.schema_type,
            })
            .collect();

        Self {
            title: document.title,
            description: document.description,
            keywords: document.keywords,
            author: document.author,
            canonical_url: document.canonical_url,
            base_href: document.base_href,
            language: document.language,
            text_direction: document.text_direction.map(convert_direction),
            open_graph: document.open_graph,
            twitter_card: document.twitter_card,
            meta_tags: document.meta_tags,
            headers,
            links,
            images,
            structured_data,
        }
    }
}
522
+
523
/// OCR processing metadata.
///
/// Captures information about OCR processing configuration and results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrMetadata {
    /// OCR language code(s) used
    pub language: String,
    /// Tesseract Page Segmentation Mode (PSM)
    pub psm: i32,
    /// Output format (e.g., "text", "hocr")
    pub output_format: String,
    /// Number of tables detected
    pub table_count: usize,

    /// Table row count, if recorded; left out of serialized output when `None`.
    ///
    /// NOTE(review): presumably the row count of a detected table — whether it
    /// is per-table or a total is not visible here; confirm against the producer.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub table_rows: Option<usize>,

    /// Table column count, if recorded; left out of serialized output when `None`.
    ///
    /// NOTE(review): presumably the column count of a detected table — confirm
    /// against the code that populates it.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub table_cols: Option<usize>,
}
543
+
544
/// Error metadata (for batch operations).
///
/// Both fields are plain strings and are required when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorMetadata {
    /// Error category as a string.
    ///
    /// NOTE(review): presumably the name of the error kind that occurred —
    /// confirm the exact values against the code that builds this struct.
    pub error_type: String,
    /// Error message text.
    pub message: String,
}
550
+
551
/// PowerPoint presentation metadata.
///
/// Extracted from PPTX files containing slide counts and presentation details.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PptxMetadata {
    /// Total number of slides in the presentation
    pub slide_count: usize,
    /// Names of slides (if available)
    ///
    /// The field has no `#[serde(default)]`, so it must be present in the
    /// input when deserializing, even if only as an empty list.
    pub slide_names: Vec<String>,
}