kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,103 @@
1
+ //! Stack management for HTML extraction with support for large documents.
2
+ //!
3
+ //! This module handles the specialized concern of managing stack size for HTML conversion,
4
+ //! particularly for large HTML documents that may require more stack space than the default.
5
+ //! On WASM, stack size is limited and cannot be increased, so size limits are enforced.
6
+ //! On native platforms, dedicated threads with larger stacks are used for large HTML.
7
+
8
+ use crate::error::{KreuzbergError, Result};
9
+
10
+ #[cfg(not(target_arch = "wasm32"))]
11
+ use std::{any::Any, thread};
12
+
13
+ #[cfg(target_arch = "wasm32")]
14
+ pub const MAX_HTML_SIZE_BYTES: usize = 2 * 1024 * 1024;
15
+
16
+ #[cfg(not(target_arch = "wasm32"))]
17
+ pub const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = 512 * 1024;
18
+
19
+ #[cfg(not(target_arch = "wasm32"))]
20
+ pub const HTML_CONVERSION_STACK_SIZE_BYTES: usize = 16 * 1024 * 1024;
21
+
22
/// Reject HTML input that is too large to convert safely on WASM.
///
/// WASM builds have a fixed stack size that cannot be increased, so input
/// longer than `MAX_HTML_SIZE_BYTES` is refused up front instead of risking a
/// stack overflow during HTML conversion.
///
/// # Errors
///
/// Returns a validation error when `html.len()` (a byte count) is greater
/// than `MAX_HTML_SIZE_BYTES`.
#[cfg(target_arch = "wasm32")]
pub fn check_wasm_size_limit(html: &str) -> Result<()> {
    let size_bytes = html.len();
    if size_bytes <= MAX_HTML_SIZE_BYTES {
        return Ok(());
    }

    let message = format!(
        "HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
         Large HTML files cannot be processed in WASM due to stack constraints. \
         Consider using the native library for files of this size.",
        size_bytes, MAX_HTML_SIZE_BYTES
    );
    Err(KreuzbergError::validation(message))
}
39
+
40
+ /// Check if HTML size exceeds WASM limit and return error if so.
41
+ ///
42
+ /// No-op on non-WASM platforms.
43
+ #[cfg(not(target_arch = "wasm32"))]
44
+ pub fn check_wasm_size_limit(_html: &str) -> Result<()> {
45
+ Ok(())
46
+ }
47
+
48
+ /// Determine if HTML requires a dedicated stack due to size.
49
+ ///
50
+ /// On native platforms, HTML larger than the threshold will be processed
51
+ /// on a dedicated thread with a larger stack to prevent overflow.
52
+ #[cfg(not(target_arch = "wasm32"))]
53
+ pub fn html_requires_large_stack(len: usize) -> bool {
54
+ len >= LARGE_HTML_STACK_THRESHOLD_BYTES
55
+ }
56
+
57
+ /// Run a job on a dedicated thread with a large stack.
58
+ ///
59
+ /// This is useful for HTML conversion of large documents that might
60
+ /// overflow the default thread stack on native platforms.
61
+ ///
62
+ /// # Arguments
63
+ ///
64
+ /// * `job` - The closure to execute on the dedicated thread
65
+ ///
66
+ /// # Returns
67
+ ///
68
+ /// The result of the job execution, or an error if the thread panics
69
+ #[cfg(not(target_arch = "wasm32"))]
70
+ pub fn run_on_dedicated_stack<T, F>(job: F) -> Result<T>
71
+ where
72
+ T: Send + 'static,
73
+ F: FnOnce() -> Result<T> + Send + 'static,
74
+ {
75
+ let handle = thread::Builder::new()
76
+ .name("kreuzberg-html-conversion".to_string())
77
+ .stack_size(HTML_CONVERSION_STACK_SIZE_BYTES)
78
+ .spawn(job)
79
+ .map_err(|err| KreuzbergError::Other(format!("Failed to spawn HTML conversion thread: {}", err)))?;
80
+
81
+ match handle.join() {
82
+ Ok(result) => result,
83
+ Err(panic) => {
84
+ let reason = extract_panic_reason(&panic);
85
+ Err(KreuzbergError::Other(format!("HTML conversion panicked: {}", reason)))
86
+ }
87
+ }
88
+ }
89
+
90
/// Turn a panic payload into a printable message.
///
/// Payloads holding a `&str` or a `String` are returned as text; any other
/// payload type yields the fixed text `"unknown panic"`.
///
/// The parameter is deliberately `&Box<dyn Any + Send>` rather than
/// `&(dyn Any + Send)`: with the latter, a caller writing `f(&boxed_payload)`
/// would coerce the `Box` itself into the trait object and every downcast
/// below would fail.
#[cfg(not(target_arch = "wasm32"))]
fn extract_panic_reason(panic: &Box<dyn Any + Send + 'static>) -> String {
    let from_str = panic.downcast_ref::<&str>().map(|text| (*text).to_string());

    from_str
        .or_else(|| panic.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic".to_string())
}
@@ -0,0 +1,28 @@
1
+ //! Type definitions for HTML extraction.
2
+
3
+ use serde::{Deserialize, Serialize};
4
+ use std::collections::HashMap;
5
+
6
+ pub use html_to_markdown_rs::{
7
+ CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingOptions,
8
+ PreprocessingPreset, WhitespaceMode,
9
+ };
10
+
11
+ /// Result of HTML extraction with optional images and warnings.
12
+ #[derive(Debug, Clone, Serialize, Deserialize)]
13
+ pub struct HtmlExtractionResult {
14
+ pub markdown: String,
15
+ pub images: Vec<ExtractedInlineImage>,
16
+ pub warnings: Vec<String>,
17
+ }
18
+
19
+ /// Extracted inline image with metadata.
20
+ #[derive(Debug, Clone, Serialize, Deserialize)]
21
+ pub struct ExtractedInlineImage {
22
+ pub data: Vec<u8>,
23
+ pub format: String,
24
+ pub filename: Option<String>,
25
+ pub description: Option<String>,
26
+ pub dimensions: Option<(u32, u32)>,
27
+ pub attributes: HashMap<String, String>,
28
+ }
@@ -1,5 +1,6 @@
1
1
  pub mod structured;
2
2
  pub mod text;
3
+ pub mod transform;
3
4
 
4
5
  #[cfg(feature = "ocr")]
5
6
  pub mod image;
@@ -41,11 +42,14 @@ pub mod table;
41
42
  #[cfg(feature = "xml")]
42
43
  pub mod xml;
43
44
 
44
- #[cfg(any(feature = "office", feature = "html"))]
45
+ #[cfg(any(feature = "office", feature = "html", feature = "xml"))]
45
46
  pub mod markdown;
46
47
 
47
48
  pub use structured::{JsonExtractionConfig, StructuredDataResult, parse_json, parse_toml, parse_yaml};
48
49
  pub use text::parse_text;
50
+ pub use transform::{
51
+ ListItemMetadata, ListType, detect_list_items, generate_element_id, transform_extraction_result_to_elements,
52
+ };
49
53
 
50
54
  #[cfg(feature = "ocr")]
51
55
  pub use image::{ImageMetadata, extract_image_metadata};
@@ -84,7 +88,7 @@ pub use table::table_from_arrow_to_markdown;
84
88
  #[cfg(feature = "xml")]
85
89
  pub use xml::parse_xml;
86
90
 
87
- #[cfg(any(feature = "office", feature = "html"))]
91
+ #[cfg(any(feature = "office", feature = "html", feature = "xml"))]
88
92
  pub use markdown::cells_to_markdown;
89
93
 
90
94
  pub use capacity::{
@@ -0,0 +1,159 @@
1
+ //! PPTX container and ZIP archive management.
2
+ //!
3
+ //! This module handles opening PPTX files, reading files from the ZIP archive,
4
+ //! finding slide paths, and iterating through slides.
5
+
6
+ use std::collections::HashMap;
7
+ use std::fs::File;
8
+ use std::io::Read;
9
+ use std::path::Path;
10
+ use zip::ZipArchive;
11
+
12
+ use super::elements::Slide;
13
+ use super::image_handling::get_full_image_path;
14
+ use crate::error::{KreuzbergError, Result};
15
+
16
+ pub(super) struct PptxContainer {
17
+ pub(super) archive: ZipArchive<File>,
18
+ slide_paths: Vec<String>,
19
+ }
20
+
21
+ impl PptxContainer {
22
+ pub(super) fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
23
+ // IO errors must bubble up unchanged - file access issues need user reports ~keep
24
+ let file = File::open(path)?;
25
+
26
+ let mut archive = match ZipArchive::new(file) {
27
+ Ok(arc) => arc,
28
+ Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
29
+ Err(e) => {
30
+ return Err(KreuzbergError::parsing(format!(
31
+ "Failed to read PPTX archive (invalid format): {}",
32
+ e
33
+ )));
34
+ }
35
+ };
36
+
37
+ let slide_paths = Self::find_slide_paths(&mut archive)?;
38
+
39
+ Ok(Self { archive, slide_paths })
40
+ }
41
+
42
+ pub(super) fn slide_paths(&self) -> &[String] {
43
+ &self.slide_paths
44
+ }
45
+
46
+ pub(super) fn read_file(&mut self, path: &str) -> Result<Vec<u8>> {
47
+ match self.archive.by_name(path) {
48
+ Ok(mut file) => {
49
+ let mut contents = Vec::new();
50
+ // IO errors must bubble up - file read issues need user reports ~keep
51
+ file.read_to_end(&mut contents)?;
52
+ Ok(contents)
53
+ }
54
+ Err(zip::result::ZipError::FileNotFound) => {
55
+ Err(KreuzbergError::parsing("File not found in archive".to_string()))
56
+ }
57
+ Err(zip::result::ZipError::Io(io_err)) => Err(io_err.into()), // Bubble up IO errors ~keep
58
+ Err(e) => Err(KreuzbergError::parsing(format!("Zip error: {}", e))),
59
+ }
60
+ }
61
+
62
+ pub(super) fn get_slide_rels_path(&self, slide_path: &str) -> String {
63
+ super::image_handling::get_slide_rels_path(slide_path)
64
+ }
65
+
66
+ fn find_slide_paths(archive: &mut ZipArchive<File>) -> Result<Vec<String>> {
67
+ if let Ok(rels_data) = Self::read_file_from_archive(archive, "ppt/_rels/presentation.xml.rels")
68
+ && let Ok(paths) = super::parser::parse_presentation_rels(&rels_data)
69
+ {
70
+ return Ok(paths);
71
+ }
72
+
73
+ let mut slide_paths = Vec::new();
74
+ for i in 0..archive.len() {
75
+ if let Ok(file) = archive.by_index(i) {
76
+ let name = file.name();
77
+ if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
78
+ slide_paths.push(name.to_string());
79
+ }
80
+ }
81
+ }
82
+
83
+ slide_paths.sort();
84
+ Ok(slide_paths)
85
+ }
86
+
87
+ fn read_file_from_archive(archive: &mut ZipArchive<File>, path: &str) -> Result<Vec<u8>> {
88
+ let mut file = match archive.by_name(path) {
89
+ Ok(f) => f,
90
+ Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
91
+ Err(e) => {
92
+ return Err(KreuzbergError::parsing(format!(
93
+ "Failed to read file from archive: {}",
94
+ e
95
+ )));
96
+ }
97
+ };
98
+ let mut contents = Vec::new();
99
+ // IO errors must bubble up - file read issues need user reports ~keep
100
+ file.read_to_end(&mut contents)?;
101
+ Ok(contents)
102
+ }
103
+ }
104
+
105
+ pub(super) struct SlideIterator {
106
+ container: PptxContainer,
107
+ current_index: usize,
108
+ total_slides: usize,
109
+ }
110
+
111
+ impl SlideIterator {
112
+ pub(super) fn new(container: PptxContainer) -> Self {
113
+ let total_slides = container.slide_paths().len();
114
+ Self {
115
+ container,
116
+ current_index: 0,
117
+ total_slides,
118
+ }
119
+ }
120
+
121
+ pub(super) fn slide_count(&self) -> usize {
122
+ self.total_slides
123
+ }
124
+
125
+ pub(super) fn next_slide(&mut self) -> Result<Option<Slide>> {
126
+ if self.current_index >= self.total_slides {
127
+ return Ok(None);
128
+ }
129
+
130
+ let slide_path = &self.container.slide_paths()[self.current_index].clone();
131
+ let slide_number = (self.current_index + 1) as u32;
132
+
133
+ let xml_data = self.container.read_file(slide_path)?;
134
+
135
+ let rels_path = self.container.get_slide_rels_path(slide_path);
136
+ let rels_data = self.container.read_file(&rels_path).ok();
137
+
138
+ let slide = Slide::from_xml(slide_number, &xml_data, rels_data.as_deref())?;
139
+
140
+ self.current_index += 1;
141
+
142
+ Ok(Some(slide))
143
+ }
144
+
145
+ pub(super) fn get_slide_images(&mut self, slide: &Slide) -> Result<HashMap<String, Vec<u8>>> {
146
+ let mut image_data = HashMap::new();
147
+
148
+ for img_ref in &slide.images {
149
+ let slide_path = &self.container.slide_paths()[slide.slide_number as usize - 1];
150
+ let full_path = get_full_image_path(slide_path, &img_ref.target);
151
+
152
+ if let Ok(data) = self.container.read_file(&full_path) {
153
+ image_data.insert(img_ref.id.clone(), data);
154
+ }
155
+ }
156
+
157
+ Ok(image_data)
158
+ }
159
+ }
@@ -0,0 +1,168 @@
1
+ //! Content builder for accumulating slide output.
2
+ //!
3
+ //! This module provides utilities for building the final markdown content
4
+ //! from slide elements and managing page boundaries.
5
+
6
/// Accumulates the text output for a presentation together with optional
/// per-slide page tracking data.
pub(super) struct ContentBuilder {
    /// Output text built up by the `add_*` methods.
    pub(super) content: String,
    /// Byte ranges of each finished slide within `content`; only filled when
    /// `config` is set.
    pub(super) boundaries: Vec<crate::types::PageBoundary>,
    /// Per-slide content records; only filled when `config` is set.
    pub(super) page_contents: Vec<crate::types::PageContent>,
    /// Page configuration; `None` disables page markers and page tracking.
    pub(super) config: Option<crate::core::config::PageConfig>,
}
12
+
13
+ impl ContentBuilder {
14
+ pub(super) fn new() -> Self {
15
+ Self {
16
+ content: String::with_capacity(8192),
17
+ boundaries: Vec::new(),
18
+ page_contents: Vec::new(),
19
+ config: None,
20
+ }
21
+ }
22
+
23
+ pub(super) fn with_page_config(capacity: usize, config: Option<crate::core::config::PageConfig>) -> Self {
24
+ Self {
25
+ content: String::with_capacity(capacity),
26
+ boundaries: if config.is_some() {
27
+ Vec::new()
28
+ } else {
29
+ Vec::with_capacity(0)
30
+ },
31
+ page_contents: if config.is_some() {
32
+ Vec::new()
33
+ } else {
34
+ Vec::with_capacity(0)
35
+ },
36
+ config,
37
+ }
38
+ }
39
+
40
+ pub(super) fn start_slide(&mut self, slide_number: u32) -> usize {
41
+ let byte_start = self.content.len();
42
+
43
+ if let Some(ref cfg) = self.config
44
+ && cfg.insert_page_markers
45
+ {
46
+ let marker = cfg.marker_format.replace("{page_num}", &slide_number.to_string());
47
+ self.content.push_str(&marker);
48
+ }
49
+
50
+ byte_start
51
+ }
52
+
53
+ pub(super) fn end_slide(&mut self, slide_number: u32, byte_start: usize, slide_content: String) {
54
+ let byte_end = self.content.len();
55
+
56
+ if self.config.is_some() {
57
+ self.boundaries.push(crate::types::PageBoundary {
58
+ byte_start,
59
+ byte_end,
60
+ page_number: slide_number as usize,
61
+ });
62
+
63
+ self.page_contents.push(crate::types::PageContent {
64
+ page_number: slide_number as usize,
65
+ content: slide_content,
66
+ tables: Vec::new(),
67
+ images: Vec::new(),
68
+ hierarchy: None,
69
+ });
70
+ }
71
+ }
72
+
73
+ pub(super) fn add_slide_header(&mut self, slide_number: u32) {
74
+ self.content.reserve(50);
75
+ self.content.push_str("\n\n<!-- Slide number: ");
76
+ self.content.push_str(&slide_number.to_string());
77
+ self.content.push_str(" -->\n");
78
+ }
79
+
80
+ pub(super) fn add_text(&mut self, text: &str) {
81
+ if !text.trim().is_empty() {
82
+ self.content.push_str(text);
83
+ }
84
+ }
85
+
86
+ pub(super) fn add_title(&mut self, title: &str) {
87
+ if !title.trim().is_empty() {
88
+ self.content.push_str("# ");
89
+ self.content.push_str(title.trim());
90
+ self.content.push('\n');
91
+ }
92
+ }
93
+
94
+ pub(super) fn add_table(&mut self, rows: &[Vec<String>]) {
95
+ if rows.is_empty() {
96
+ return;
97
+ }
98
+
99
+ self.content.push_str("\n<table>");
100
+ for (i, row) in rows.iter().enumerate() {
101
+ self.content.push_str("<tr>");
102
+ let tag = if i == 0 { "th" } else { "td" };
103
+
104
+ for cell in row {
105
+ self.content.push('<');
106
+ self.content.push_str(tag);
107
+ self.content.push('>');
108
+ self.content.push_str(&super::image_handling::html_escape(cell));
109
+ self.content.push_str("</");
110
+ self.content.push_str(tag);
111
+ self.content.push('>');
112
+ }
113
+ self.content.push_str("</tr>");
114
+ }
115
+ self.content.push_str("</table>\n");
116
+ }
117
+
118
+ pub(super) fn add_list_item(&mut self, level: u32, is_ordered: bool, text: &str) {
119
+ let indent_count = level.saturating_sub(1) as usize;
120
+ for _ in 0..indent_count {
121
+ self.content.push_str(" ");
122
+ }
123
+
124
+ let marker = if is_ordered { "1." } else { "-" };
125
+ self.content.push_str(marker);
126
+ self.content.push(' ');
127
+ self.content.push_str(text.trim());
128
+ self.content.push('\n');
129
+ }
130
+
131
+ pub(super) fn add_image(&mut self, image_id: &str, slide_number: u32) {
132
+ let filename = format!("slide_{}_image_{}.jpg", slide_number, image_id);
133
+ self.content.push_str("![");
134
+ self.content.push_str(image_id);
135
+ self.content.push_str("](");
136
+ self.content.push_str(&filename);
137
+ self.content.push_str(")\n");
138
+ }
139
+
140
+ pub(super) fn add_notes(&mut self, notes: &str) {
141
+ if !notes.trim().is_empty() {
142
+ self.content.push_str("\n\n### Notes:\n");
143
+ self.content.push_str(notes);
144
+ self.content.push('\n');
145
+ }
146
+ }
147
+
148
+ pub(super) fn build(
149
+ self,
150
+ ) -> (
151
+ String,
152
+ Option<Vec<crate::types::PageBoundary>>,
153
+ Option<Vec<crate::types::PageContent>>,
154
+ ) {
155
+ let content = self.content.trim().to_string();
156
+ let boundaries = if self.config.is_some() && !self.boundaries.is_empty() {
157
+ Some(self.boundaries)
158
+ } else {
159
+ None
160
+ };
161
+ let pages = if self.config.is_some() && !self.page_contents.is_empty() {
162
+ Some(self.page_contents)
163
+ } else {
164
+ None
165
+ };
166
+ (content, boundaries, pages)
167
+ }
168
+ }
@@ -0,0 +1,132 @@
1
+ //! Internal types for PPTX extraction.
2
+ //!
3
+ //! This module defines the internal data structures used to represent
4
+ //! slide elements, formatting, and text runs during XML parsing.
5
+
6
/// Position of a slide element as a pair of signed 64-bit coordinates.
///
/// The derived ordering compares `x` first and then `y`, following the field
/// declaration order.
// NOTE(review): units are presumably the EMU offsets found in slide XML — confirm against the parser.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)]
pub(super) struct ElementPosition {
    /// Horizontal coordinate.
    pub(super) x: i64,
    /// Vertical coordinate.
    pub(super) y: i64,
}
11
+
12
/// Character formatting attached to a text run.
///
/// The default value has every flag cleared and an empty `lang`.
#[derive(Debug, Clone, Default)]
pub(super) struct Formatting {
    /// Render the run in bold.
    pub(super) bold: bool,
    /// Render the run in italics.
    pub(super) italic: bool,
    /// Render the run underlined.
    pub(super) underlined: bool,
    /// Language tag of the run.
    // NOTE(review): presumably copied from the run's `lang` attribute — confirm against the parser.
    pub(super) lang: String,
}
19
+
20
+ #[derive(Debug, Clone)]
21
+ pub(super) struct Run {
22
+ pub(super) text: String,
23
+ pub(super) formatting: Formatting,
24
+ }
25
+
26
+ impl Run {
27
+ pub(super) fn extract(&self) -> String {
28
+ self.text.clone()
29
+ }
30
+
31
+ pub(super) fn render_as_md(&self) -> String {
32
+ let mut result = self.text.clone();
33
+
34
+ if self.formatting.bold {
35
+ result = format!("**{}**", result);
36
+ }
37
+ if self.formatting.italic {
38
+ result = format!("*{}*", result);
39
+ }
40
+ if self.formatting.underlined {
41
+ result = format!("<u>{}</u>", result);
42
+ }
43
+
44
+ result
45
+ }
46
+ }
47
+
48
/// A text block made up of consecutive formatted runs.
#[derive(Debug, Clone)]
pub(super) struct TextElement {
    /// Runs in the order they are stored.
    pub(super) runs: Vec<Run>,
}
52
+
53
/// A single entry of a list.
#[derive(Debug, Clone)]
pub(super) struct ListItem {
    /// Nesting level of the entry.
    // NOTE(review): presumably 1-based, matching `ContentBuilder::add_list_item` — confirm against the parser.
    pub(super) level: u32,
    /// Whether the entry belongs to an ordered (numbered) list.
    pub(super) is_ordered: bool,
    /// Formatted runs making up the entry text.
    pub(super) runs: Vec<Run>,
}
59
+
60
/// A list, stored as a flat sequence of items that each carry their own level.
#[derive(Debug, Clone)]
pub(super) struct ListElement {
    /// Items in the order they are stored.
    pub(super) items: Vec<ListItem>,
}
64
+
65
/// A single table cell.
#[derive(Debug, Clone)]
pub(super) struct TableCell {
    /// Formatted runs making up the cell text.
    pub(super) runs: Vec<Run>,
}
69
+
70
/// A single table row.
#[derive(Debug, Clone)]
pub(super) struct TableRow {
    /// Cells of the row in the order they are stored.
    pub(super) cells: Vec<TableCell>,
}
74
+
75
/// A table, stored as a sequence of rows.
#[derive(Debug, Clone)]
pub(super) struct TableElement {
    /// Rows of the table in the order they are stored.
    pub(super) rows: Vec<TableRow>,
}
79
+
80
/// Reference from a slide to an image part.
#[derive(Debug, Clone)]
pub(super) struct ImageReference {
    /// Identifier of the reference; used as the key when image data is collected.
    // NOTE(review): presumably the relationship id from the slide's `.rels` part — confirm against the parser.
    pub(super) id: String,
    /// Target path of the image; resolved against the slide path when the image is read.
    pub(super) target: String,
}
85
+
86
/// One element found on a slide, paired with its position where one exists.
#[derive(Debug, Clone)]
pub(super) enum SlideElement {
    /// A text block and its position.
    Text(TextElement, ElementPosition),
    /// A table and its position.
    Table(TableElement, ElementPosition),
    /// An image reference and its position.
    Image(ImageReference, ElementPosition),
    /// A list and its position.
    List(ListElement, ElementPosition),
    /// An element without payload or position.
    Unknown,
}
94
+
95
+ impl SlideElement {
96
+ pub(super) fn position(&self) -> ElementPosition {
97
+ match self {
98
+ SlideElement::Text(_, pos)
99
+ | SlideElement::Table(_, pos)
100
+ | SlideElement::Image(_, pos)
101
+ | SlideElement::List(_, pos) => *pos,
102
+ SlideElement::Unknown => ElementPosition::default(),
103
+ }
104
+ }
105
+ }
106
+
107
/// A parsed slide.
#[derive(Debug)]
pub(super) struct Slide {
    /// Number of the slide; the slide iterator assigns it 1-based.
    pub(super) slide_number: u32,
    /// Elements found on the slide.
    pub(super) elements: Vec<SlideElement>,
    /// Image references of the slide; iterated when image data is loaded.
    pub(super) images: Vec<ImageReference>,
}
113
+
114
+ #[derive(Debug, Clone)]
115
+ pub(super) struct ParserConfig {
116
+ pub(super) extract_images: bool,
117
+ pub(super) include_slide_comment: bool,
118
+ }
119
+
120
+ impl Default for ParserConfig {
121
+ fn default() -> Self {
122
+ Self {
123
+ extract_images: true,
124
+ include_slide_comment: false,
125
+ }
126
+ }
127
+ }
128
+
129
/// Parsed textual content that is either a plain text block or a list.
pub(super) enum ParsedContent {
    /// Content parsed as a text block.
    Text(TextElement),
    /// Content parsed as a list.
    List(ListElement),
}
@@ -0,0 +1,57 @@
1
+ //! Image parsing and format detection.
2
+ //!
3
+ //! This module handles image-related parsing from slide XML and
4
+ //! detection of image formats from file data.
5
+
6
+ pub(super) fn html_escape(text: &str) -> String {
7
+ text.replace('&', "&amp;")
8
+ .replace('<', "&lt;")
9
+ .replace('>', "&gt;")
10
+ .replace('"', "&quot;")
11
+ .replace('\'', "&#x27;")
12
+ }
13
+
14
+ pub(super) fn detect_image_format(data: &[u8]) -> String {
15
+ if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
16
+ "jpeg".to_string()
17
+ } else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
18
+ "png".to_string()
19
+ } else if data.starts_with(b"GIF") {
20
+ "gif".to_string()
21
+ } else if data.starts_with(b"BM") {
22
+ "bmp".to_string()
23
+ } else if data.starts_with(b"<svg") || data.starts_with(b"<?xml") {
24
+ "svg".to_string()
25
+ } else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
26
+ "tiff".to_string()
27
+ } else {
28
+ "unknown".to_string()
29
+ }
30
+ }
31
+
32
+ pub(super) fn get_slide_rels_path(slide_path: &str) -> String {
33
+ let parts: Vec<&str> = slide_path.rsplitn(2, '/').collect();
34
+ if parts.len() == 2 {
35
+ format!("{}/_rels/{}.rels", parts[1], parts[0])
36
+ } else {
37
+ format!("_rels/{}.rels", slide_path)
38
+ }
39
+ }
40
+
41
+ pub(super) fn get_full_image_path(slide_path: &str, image_target: &str) -> String {
42
+ if image_target.starts_with("..") {
43
+ let parts: Vec<&str> = slide_path.rsplitn(3, '/').collect();
44
+ if parts.len() >= 3 {
45
+ format!("{}/{}", parts[2], &image_target[3..])
46
+ } else {
47
+ format!("ppt/{}", &image_target[3..])
48
+ }
49
+ } else {
50
+ let parts: Vec<&str> = slide_path.rsplitn(2, '/').collect();
51
+ if parts.len() == 2 {
52
+ format!("{}/{}", parts[1], image_target)
53
+ } else {
54
+ format!("ppt/slides/{}", image_target)
55
+ }
56
+ }
57
+ }