kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,103 @@
1
+ //! Stack management for HTML extraction with support for large documents.
2
+ //!
3
+ //! This module handles the specialized concern of managing stack size for HTML conversion,
4
+ //! particularly for large HTML documents that may require more stack space than the default.
5
+ //! On WASM, stack size is limited and cannot be increased, so size limits are enforced.
6
+ //! On native platforms, dedicated threads with larger stacks are used for large HTML.
7
+
8
+ use crate::error::{KreuzbergError, Result};
9
+
10
+ #[cfg(not(target_arch = "wasm32"))]
11
+ use std::{any::Any, thread};
12
+
/// Largest HTML input, in bytes, accepted on `wasm32` targets (2 MiB).
///
/// `check_wasm_size_limit` rejects anything longer than this.
#[cfg(target_arch = "wasm32")]
pub const MAX_HTML_SIZE_BYTES: usize = 2 * 1024 * 1024;

/// HTML length, in bytes, from which `html_requires_large_stack` reports that
/// a dedicated stack is needed on native targets (512 KiB).
#[cfg(not(target_arch = "wasm32"))]
pub const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = 512 * 1024;

/// Stack size, in bytes, requested for the thread spawned by
/// `run_on_dedicated_stack` on native targets (16 MiB).
#[cfg(not(target_arch = "wasm32"))]
pub const HTML_CONVERSION_STACK_SIZE_BYTES: usize = 16 * 1024 * 1024;
21
+
/// Rejects HTML input that is too large to convert on WASM.
///
/// The length of `html` in bytes is compared against [`MAX_HTML_SIZE_BYTES`];
/// input at or below the limit is accepted.
///
/// # Errors
///
/// Returns a validation error naming both the actual size and the limit when
/// `html` is longer than [`MAX_HTML_SIZE_BYTES`].
#[cfg(target_arch = "wasm32")]
pub fn check_wasm_size_limit(html: &str) -> Result<()> {
    let size = html.len();
    if size <= MAX_HTML_SIZE_BYTES {
        return Ok(());
    }

    Err(KreuzbergError::validation(format!(
        "HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
         Large HTML files cannot be processed in WASM due to stack constraints. \
         Consider using the native library for files of this size.",
        size, MAX_HTML_SIZE_BYTES
    )))
}
39
+
40
+ /// Check if HTML size exceeds WASM limit and return error if so.
41
+ ///
42
+ /// No-op on non-WASM platforms.
43
+ #[cfg(not(target_arch = "wasm32"))]
44
+ pub fn check_wasm_size_limit(_html: &str) -> Result<()> {
45
+ Ok(())
46
+ }
47
+
48
+ /// Determine if HTML requires a dedicated stack due to size.
49
+ ///
50
+ /// On native platforms, HTML larger than the threshold will be processed
51
+ /// on a dedicated thread with a larger stack to prevent overflow.
52
+ #[cfg(not(target_arch = "wasm32"))]
53
+ pub fn html_requires_large_stack(len: usize) -> bool {
54
+ len >= LARGE_HTML_STACK_THRESHOLD_BYTES
55
+ }
56
+
57
+ /// Run a job on a dedicated thread with a large stack.
58
+ ///
59
+ /// This is useful for HTML conversion of large documents that might
60
+ /// overflow the default thread stack on native platforms.
61
+ ///
62
+ /// # Arguments
63
+ ///
64
+ /// * `job` - The closure to execute on the dedicated thread
65
+ ///
66
+ /// # Returns
67
+ ///
68
+ /// The result of the job execution, or an error if the thread panics
69
+ #[cfg(not(target_arch = "wasm32"))]
70
+ pub fn run_on_dedicated_stack<T, F>(job: F) -> Result<T>
71
+ where
72
+ T: Send + 'static,
73
+ F: FnOnce() -> Result<T> + Send + 'static,
74
+ {
75
+ let handle = thread::Builder::new()
76
+ .name("kreuzberg-html-conversion".to_string())
77
+ .stack_size(HTML_CONVERSION_STACK_SIZE_BYTES)
78
+ .spawn(job)
79
+ .map_err(|err| KreuzbergError::Other(format!("Failed to spawn HTML conversion thread: {}", err)))?;
80
+
81
+ match handle.join() {
82
+ Ok(result) => result,
83
+ Err(panic) => {
84
+ let reason = extract_panic_reason(&panic);
85
+ Err(KreuzbergError::Other(format!("HTML conversion panicked: {}", reason)))
86
+ }
87
+ }
88
+ }
89
+
/// Turns a panic payload into a printable message.
///
/// Payloads that are a `&'static str` or a `String` are returned as text;
/// every other payload type yields `"unknown panic"`.
///
/// The parameter is deliberately the boxed payload (`&Box<dyn Any + Send>`)
/// rather than `&dyn Any`: passing `&payload` to a `&dyn Any` parameter would
/// coerce the `Box` itself into the trait object, and both downcasts would
/// then fail.
#[cfg(not(target_arch = "wasm32"))]
fn extract_panic_reason(panic: &Box<dyn Any + Send + 'static>) -> String {
    panic
        .downcast_ref::<&str>()
        .map(|text| (*text).to_string())
        .or_else(|| panic.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic".to_string())
}
@@ -0,0 +1,28 @@
1
+ //! Type definitions for HTML extraction.
2
+
3
+ use serde::{Deserialize, Serialize};
4
+ use std::collections::HashMap;
5
+
6
+ pub use html_to_markdown_rs::{
7
+ CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingOptions,
8
+ PreprocessingPreset, WhitespaceMode,
9
+ };
10
+
11
+ /// Result of HTML extraction with optional images and warnings.
12
+ #[derive(Debug, Clone, Serialize, Deserialize)]
13
+ pub struct HtmlExtractionResult {
14
+ pub markdown: String,
15
+ pub images: Vec<ExtractedInlineImage>,
16
+ pub warnings: Vec<String>,
17
+ }
18
+
19
+ /// Extracted inline image with metadata.
20
+ #[derive(Debug, Clone, Serialize, Deserialize)]
21
+ pub struct ExtractedInlineImage {
22
+ pub data: Vec<u8>,
23
+ pub format: String,
24
+ pub filename: Option<String>,
25
+ pub description: Option<String>,
26
+ pub dimensions: Option<(u32, u32)>,
27
+ pub attributes: HashMap<String, String>,
28
+ }
@@ -1,5 +1,6 @@
1
1
  pub mod structured;
2
2
  pub mod text;
3
+ pub mod transform;
3
4
 
4
5
  #[cfg(feature = "ocr")]
5
6
  pub mod image;
@@ -41,11 +42,14 @@ pub mod table;
41
42
  #[cfg(feature = "xml")]
42
43
  pub mod xml;
43
44
 
44
- #[cfg(any(feature = "office", feature = "html"))]
45
+ #[cfg(any(feature = "office", feature = "html", feature = "xml"))]
45
46
  pub mod markdown;
46
47
 
47
48
  pub use structured::{JsonExtractionConfig, StructuredDataResult, parse_json, parse_toml, parse_yaml};
48
49
  pub use text::parse_text;
50
+ pub use transform::{
51
+ ListItemMetadata, ListType, detect_list_items, generate_element_id, transform_extraction_result_to_elements,
52
+ };
49
53
 
50
54
  #[cfg(feature = "ocr")]
51
55
  pub use image::{ImageMetadata, extract_image_metadata};
@@ -84,7 +88,7 @@ pub use table::table_from_arrow_to_markdown;
84
88
  #[cfg(feature = "xml")]
85
89
  pub use xml::parse_xml;
86
90
 
87
- #[cfg(any(feature = "office", feature = "html"))]
91
+ #[cfg(any(feature = "office", feature = "html", feature = "xml"))]
88
92
  pub use markdown::cells_to_markdown;
89
93
 
90
94
  pub use capacity::{
@@ -0,0 +1,159 @@
1
+ //! PPTX container and ZIP archive management.
2
+ //!
3
+ //! This module handles opening PPTX files, reading files from the ZIP archive,
4
+ //! finding slide paths, and iterating through slides.
5
+
6
+ use std::collections::HashMap;
7
+ use std::fs::File;
8
+ use std::io::Read;
9
+ use std::path::Path;
10
+ use zip::ZipArchive;
11
+
12
+ use super::elements::Slide;
13
+ use super::image_handling::get_full_image_path;
14
+ use crate::error::{KreuzbergError, Result};
15
+
16
+ pub(super) struct PptxContainer {
17
+ pub(super) archive: ZipArchive<File>,
18
+ slide_paths: Vec<String>,
19
+ }
20
+
21
+ impl PptxContainer {
22
+ pub(super) fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
23
+ // IO errors must bubble up unchanged - file access issues need user reports ~keep
24
+ let file = File::open(path)?;
25
+
26
+ let mut archive = match ZipArchive::new(file) {
27
+ Ok(arc) => arc,
28
+ Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
29
+ Err(e) => {
30
+ return Err(KreuzbergError::parsing(format!(
31
+ "Failed to read PPTX archive (invalid format): {}",
32
+ e
33
+ )));
34
+ }
35
+ };
36
+
37
+ let slide_paths = Self::find_slide_paths(&mut archive)?;
38
+
39
+ Ok(Self { archive, slide_paths })
40
+ }
41
+
42
+ pub(super) fn slide_paths(&self) -> &[String] {
43
+ &self.slide_paths
44
+ }
45
+
46
+ pub(super) fn read_file(&mut self, path: &str) -> Result<Vec<u8>> {
47
+ match self.archive.by_name(path) {
48
+ Ok(mut file) => {
49
+ let mut contents = Vec::new();
50
+ // IO errors must bubble up - file read issues need user reports ~keep
51
+ file.read_to_end(&mut contents)?;
52
+ Ok(contents)
53
+ }
54
+ Err(zip::result::ZipError::FileNotFound) => {
55
+ Err(KreuzbergError::parsing("File not found in archive".to_string()))
56
+ }
57
+ Err(zip::result::ZipError::Io(io_err)) => Err(io_err.into()), // Bubble up IO errors ~keep
58
+ Err(e) => Err(KreuzbergError::parsing(format!("Zip error: {}", e))),
59
+ }
60
+ }
61
+
62
+ pub(super) fn get_slide_rels_path(&self, slide_path: &str) -> String {
63
+ super::image_handling::get_slide_rels_path(slide_path)
64
+ }
65
+
66
+ fn find_slide_paths(archive: &mut ZipArchive<File>) -> Result<Vec<String>> {
67
+ if let Ok(rels_data) = Self::read_file_from_archive(archive, "ppt/_rels/presentation.xml.rels")
68
+ && let Ok(paths) = super::parser::parse_presentation_rels(&rels_data)
69
+ {
70
+ return Ok(paths);
71
+ }
72
+
73
+ let mut slide_paths = Vec::new();
74
+ for i in 0..archive.len() {
75
+ if let Ok(file) = archive.by_index(i) {
76
+ let name = file.name();
77
+ if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
78
+ slide_paths.push(name.to_string());
79
+ }
80
+ }
81
+ }
82
+
83
+ slide_paths.sort();
84
+ Ok(slide_paths)
85
+ }
86
+
87
+ fn read_file_from_archive(archive: &mut ZipArchive<File>, path: &str) -> Result<Vec<u8>> {
88
+ let mut file = match archive.by_name(path) {
89
+ Ok(f) => f,
90
+ Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
91
+ Err(e) => {
92
+ return Err(KreuzbergError::parsing(format!(
93
+ "Failed to read file from archive: {}",
94
+ e
95
+ )));
96
+ }
97
+ };
98
+ let mut contents = Vec::new();
99
+ // IO errors must bubble up - file read issues need user reports ~keep
100
+ file.read_to_end(&mut contents)?;
101
+ Ok(contents)
102
+ }
103
+ }
104
+
105
+ pub(super) struct SlideIterator {
106
+ container: PptxContainer,
107
+ current_index: usize,
108
+ total_slides: usize,
109
+ }
110
+
111
+ impl SlideIterator {
112
+ pub(super) fn new(container: PptxContainer) -> Self {
113
+ let total_slides = container.slide_paths().len();
114
+ Self {
115
+ container,
116
+ current_index: 0,
117
+ total_slides,
118
+ }
119
+ }
120
+
121
+ pub(super) fn slide_count(&self) -> usize {
122
+ self.total_slides
123
+ }
124
+
125
+ pub(super) fn next_slide(&mut self) -> Result<Option<Slide>> {
126
+ if self.current_index >= self.total_slides {
127
+ return Ok(None);
128
+ }
129
+
130
+ let slide_path = &self.container.slide_paths()[self.current_index].clone();
131
+ let slide_number = (self.current_index + 1) as u32;
132
+
133
+ let xml_data = self.container.read_file(slide_path)?;
134
+
135
+ let rels_path = self.container.get_slide_rels_path(slide_path);
136
+ let rels_data = self.container.read_file(&rels_path).ok();
137
+
138
+ let slide = Slide::from_xml(slide_number, &xml_data, rels_data.as_deref())?;
139
+
140
+ self.current_index += 1;
141
+
142
+ Ok(Some(slide))
143
+ }
144
+
145
+ pub(super) fn get_slide_images(&mut self, slide: &Slide) -> Result<HashMap<String, Vec<u8>>> {
146
+ let mut image_data = HashMap::new();
147
+
148
+ for img_ref in &slide.images {
149
+ let slide_path = &self.container.slide_paths()[slide.slide_number as usize - 1];
150
+ let full_path = get_full_image_path(slide_path, &img_ref.target);
151
+
152
+ if let Ok(data) = self.container.read_file(&full_path) {
153
+ image_data.insert(img_ref.id.clone(), data);
154
+ }
155
+ }
156
+
157
+ Ok(image_data)
158
+ }
159
+ }
@@ -0,0 +1,168 @@
1
+ //! Content builder for accumulating slide output.
2
+ //!
3
+ //! This module provides utilities for building the final markdown content
4
+ //! from slide elements and managing page boundaries.
5
+
6
+ pub(super) struct ContentBuilder {
7
+ pub(super) content: String,
8
+ pub(super) boundaries: Vec<crate::types::PageBoundary>,
9
+ pub(super) page_contents: Vec<crate::types::PageContent>,
10
+ pub(super) config: Option<crate::core::config::PageConfig>,
11
+ }
12
+
13
+ impl ContentBuilder {
14
+ pub(super) fn new() -> Self {
15
+ Self {
16
+ content: String::with_capacity(8192),
17
+ boundaries: Vec::new(),
18
+ page_contents: Vec::new(),
19
+ config: None,
20
+ }
21
+ }
22
+
23
+ pub(super) fn with_page_config(capacity: usize, config: Option<crate::core::config::PageConfig>) -> Self {
24
+ Self {
25
+ content: String::with_capacity(capacity),
26
+ boundaries: if config.is_some() {
27
+ Vec::new()
28
+ } else {
29
+ Vec::with_capacity(0)
30
+ },
31
+ page_contents: if config.is_some() {
32
+ Vec::new()
33
+ } else {
34
+ Vec::with_capacity(0)
35
+ },
36
+ config,
37
+ }
38
+ }
39
+
40
+ pub(super) fn start_slide(&mut self, slide_number: u32) -> usize {
41
+ let byte_start = self.content.len();
42
+
43
+ if let Some(ref cfg) = self.config
44
+ && cfg.insert_page_markers
45
+ {
46
+ let marker = cfg.marker_format.replace("{page_num}", &slide_number.to_string());
47
+ self.content.push_str(&marker);
48
+ }
49
+
50
+ byte_start
51
+ }
52
+
53
+ pub(super) fn end_slide(&mut self, slide_number: u32, byte_start: usize, slide_content: String) {
54
+ let byte_end = self.content.len();
55
+
56
+ if self.config.is_some() {
57
+ self.boundaries.push(crate::types::PageBoundary {
58
+ byte_start,
59
+ byte_end,
60
+ page_number: slide_number as usize,
61
+ });
62
+
63
+ self.page_contents.push(crate::types::PageContent {
64
+ page_number: slide_number as usize,
65
+ content: slide_content,
66
+ tables: Vec::new(),
67
+ images: Vec::new(),
68
+ hierarchy: None,
69
+ });
70
+ }
71
+ }
72
+
73
+ pub(super) fn add_slide_header(&mut self, slide_number: u32) {
74
+ self.content.reserve(50);
75
+ self.content.push_str("\n\n<!-- Slide number: ");
76
+ self.content.push_str(&slide_number.to_string());
77
+ self.content.push_str(" -->\n");
78
+ }
79
+
80
+ pub(super) fn add_text(&mut self, text: &str) {
81
+ if !text.trim().is_empty() {
82
+ self.content.push_str(text);
83
+ }
84
+ }
85
+
86
+ pub(super) fn add_title(&mut self, title: &str) {
87
+ if !title.trim().is_empty() {
88
+ self.content.push_str("# ");
89
+ self.content.push_str(title.trim());
90
+ self.content.push('\n');
91
+ }
92
+ }
93
+
94
+ pub(super) fn add_table(&mut self, rows: &[Vec<String>]) {
95
+ if rows.is_empty() {
96
+ return;
97
+ }
98
+
99
+ self.content.push_str("\n<table>");
100
+ for (i, row) in rows.iter().enumerate() {
101
+ self.content.push_str("<tr>");
102
+ let tag = if i == 0 { "th" } else { "td" };
103
+
104
+ for cell in row {
105
+ self.content.push('<');
106
+ self.content.push_str(tag);
107
+ self.content.push('>');
108
+ self.content.push_str(&super::image_handling::html_escape(cell));
109
+ self.content.push_str("</");
110
+ self.content.push_str(tag);
111
+ self.content.push('>');
112
+ }
113
+ self.content.push_str("</tr>");
114
+ }
115
+ self.content.push_str("</table>\n");
116
+ }
117
+
118
+ pub(super) fn add_list_item(&mut self, level: u32, is_ordered: bool, text: &str) {
119
+ let indent_count = level.saturating_sub(1) as usize;
120
+ for _ in 0..indent_count {
121
+ self.content.push_str(" ");
122
+ }
123
+
124
+ let marker = if is_ordered { "1." } else { "-" };
125
+ self.content.push_str(marker);
126
+ self.content.push(' ');
127
+ self.content.push_str(text.trim());
128
+ self.content.push('\n');
129
+ }
130
+
131
+ pub(super) fn add_image(&mut self, image_id: &str, slide_number: u32) {
132
+ let filename = format!("slide_{}_image_{}.jpg", slide_number, image_id);
133
+ self.content.push_str("![");
134
+ self.content.push_str(image_id);
135
+ self.content.push_str("](");
136
+ self.content.push_str(&filename);
137
+ self.content.push_str(")\n");
138
+ }
139
+
140
+ pub(super) fn add_notes(&mut self, notes: &str) {
141
+ if !notes.trim().is_empty() {
142
+ self.content.push_str("\n\n### Notes:\n");
143
+ self.content.push_str(notes);
144
+ self.content.push('\n');
145
+ }
146
+ }
147
+
148
+ pub(super) fn build(
149
+ self,
150
+ ) -> (
151
+ String,
152
+ Option<Vec<crate::types::PageBoundary>>,
153
+ Option<Vec<crate::types::PageContent>>,
154
+ ) {
155
+ let content = self.content.trim().to_string();
156
+ let boundaries = if self.config.is_some() && !self.boundaries.is_empty() {
157
+ Some(self.boundaries)
158
+ } else {
159
+ None
160
+ };
161
+ let pages = if self.config.is_some() && !self.page_contents.is_empty() {
162
+ Some(self.page_contents)
163
+ } else {
164
+ None
165
+ };
166
+ (content, boundaries, pages)
167
+ }
168
+ }
@@ -0,0 +1,132 @@
1
+ //! Internal types for PPTX extraction.
2
+ //!
3
+ //! This module defines the internal data structures used to represent
4
+ //! slide elements, formatting, and text runs during XML parsing.
5
+
6
+ #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)]
7
+ pub(super) struct ElementPosition {
8
+ pub(super) x: i64,
9
+ pub(super) y: i64,
10
+ }
11
+
12
+ #[derive(Debug, Clone, Default)]
13
+ pub(super) struct Formatting {
14
+ pub(super) bold: bool,
15
+ pub(super) italic: bool,
16
+ pub(super) underlined: bool,
17
+ pub(super) lang: String,
18
+ }
19
+
20
+ #[derive(Debug, Clone)]
21
+ pub(super) struct Run {
22
+ pub(super) text: String,
23
+ pub(super) formatting: Formatting,
24
+ }
25
+
26
+ impl Run {
27
+ pub(super) fn extract(&self) -> String {
28
+ self.text.clone()
29
+ }
30
+
31
+ pub(super) fn render_as_md(&self) -> String {
32
+ let mut result = self.text.clone();
33
+
34
+ if self.formatting.bold {
35
+ result = format!("**{}**", result);
36
+ }
37
+ if self.formatting.italic {
38
+ result = format!("*{}*", result);
39
+ }
40
+ if self.formatting.underlined {
41
+ result = format!("<u>{}</u>", result);
42
+ }
43
+
44
+ result
45
+ }
46
+ }
47
+
48
+ #[derive(Debug, Clone)]
49
+ pub(super) struct TextElement {
50
+ pub(super) runs: Vec<Run>,
51
+ }
52
+
53
+ #[derive(Debug, Clone)]
54
+ pub(super) struct ListItem {
55
+ pub(super) level: u32,
56
+ pub(super) is_ordered: bool,
57
+ pub(super) runs: Vec<Run>,
58
+ }
59
+
60
+ #[derive(Debug, Clone)]
61
+ pub(super) struct ListElement {
62
+ pub(super) items: Vec<ListItem>,
63
+ }
64
+
65
+ #[derive(Debug, Clone)]
66
+ pub(super) struct TableCell {
67
+ pub(super) runs: Vec<Run>,
68
+ }
69
+
70
+ #[derive(Debug, Clone)]
71
+ pub(super) struct TableRow {
72
+ pub(super) cells: Vec<TableCell>,
73
+ }
74
+
75
+ #[derive(Debug, Clone)]
76
+ pub(super) struct TableElement {
77
+ pub(super) rows: Vec<TableRow>,
78
+ }
79
+
80
+ #[derive(Debug, Clone)]
81
+ pub(super) struct ImageReference {
82
+ pub(super) id: String,
83
+ pub(super) target: String,
84
+ }
85
+
86
+ #[derive(Debug, Clone)]
87
+ pub(super) enum SlideElement {
88
+ Text(TextElement, ElementPosition),
89
+ Table(TableElement, ElementPosition),
90
+ Image(ImageReference, ElementPosition),
91
+ List(ListElement, ElementPosition),
92
+ Unknown,
93
+ }
94
+
95
+ impl SlideElement {
96
+ pub(super) fn position(&self) -> ElementPosition {
97
+ match self {
98
+ SlideElement::Text(_, pos)
99
+ | SlideElement::Table(_, pos)
100
+ | SlideElement::Image(_, pos)
101
+ | SlideElement::List(_, pos) => *pos,
102
+ SlideElement::Unknown => ElementPosition::default(),
103
+ }
104
+ }
105
+ }
106
+
107
+ #[derive(Debug)]
108
+ pub(super) struct Slide {
109
+ pub(super) slide_number: u32,
110
+ pub(super) elements: Vec<SlideElement>,
111
+ pub(super) images: Vec<ImageReference>,
112
+ }
113
+
114
+ #[derive(Debug, Clone)]
115
+ pub(super) struct ParserConfig {
116
+ pub(super) extract_images: bool,
117
+ pub(super) include_slide_comment: bool,
118
+ }
119
+
120
+ impl Default for ParserConfig {
121
+ fn default() -> Self {
122
+ Self {
123
+ extract_images: true,
124
+ include_slide_comment: false,
125
+ }
126
+ }
127
+ }
128
+
129
+ pub(super) enum ParsedContent {
130
+ Text(TextElement),
131
+ List(ListElement),
132
+ }
@@ -0,0 +1,57 @@
1
+ //! Image parsing and format detection.
2
+ //!
3
+ //! This module handles image-related parsing from slide XML and
4
+ //! detection of image formats from file data.
5
+
6
+ pub(super) fn html_escape(text: &str) -> String {
7
+ text.replace('&', "&amp;")
8
+ .replace('<', "&lt;")
9
+ .replace('>', "&gt;")
10
+ .replace('"', "&quot;")
11
+ .replace('\'', "&#x27;")
12
+ }
13
+
14
+ pub(super) fn detect_image_format(data: &[u8]) -> String {
15
+ if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
16
+ "jpeg".to_string()
17
+ } else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
18
+ "png".to_string()
19
+ } else if data.starts_with(b"GIF") {
20
+ "gif".to_string()
21
+ } else if data.starts_with(b"BM") {
22
+ "bmp".to_string()
23
+ } else if data.starts_with(b"<svg") || data.starts_with(b"<?xml") {
24
+ "svg".to_string()
25
+ } else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
26
+ "tiff".to_string()
27
+ } else {
28
+ "unknown".to_string()
29
+ }
30
+ }
31
+
32
+ pub(super) fn get_slide_rels_path(slide_path: &str) -> String {
33
+ let parts: Vec<&str> = slide_path.rsplitn(2, '/').collect();
34
+ if parts.len() == 2 {
35
+ format!("{}/_rels/{}.rels", parts[1], parts[0])
36
+ } else {
37
+ format!("_rels/{}.rels", slide_path)
38
+ }
39
+ }
40
+
41
+ pub(super) fn get_full_image_path(slide_path: &str, image_target: &str) -> String {
42
+ if image_target.starts_with("..") {
43
+ let parts: Vec<&str> = slide_path.rsplitn(3, '/').collect();
44
+ if parts.len() >= 3 {
45
+ format!("{}/{}", parts[2], &image_target[3..])
46
+ } else {
47
+ format!("ppt/{}", &image_target[3..])
48
+ }
49
+ } else {
50
+ let parts: Vec<&str> = slide_path.rsplitn(2, '/').collect();
51
+ if parts.len() == 2 {
52
+ format!("{}/{}", parts[1], image_target)
53
+ } else {
54
+ format!("ppt/slides/{}", image_target)
55
+ }
56
+ }
57
+ }