kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -63,6 +63,8 @@ impl OcrBackend for MockOcrBackend {
63
63
  chunks: None,
64
64
  images: None,
65
65
  pages: None,
66
+ djot_content: None,
67
+ elements: None,
66
68
  })
67
69
  }
68
70
 
@@ -159,6 +161,8 @@ impl OcrBackend for ValidatingOcrBackend {
159
161
  chunks: None,
160
162
  images: None,
161
163
  pages: None,
164
+ djot_content: None,
165
+ elements: None,
162
166
  })
163
167
  }
164
168
 
@@ -216,6 +220,8 @@ impl OcrBackend for MetadataOcrBackend {
216
220
  chunks: None,
217
221
  images: None,
218
222
  pages: None,
223
+ djot_content: None,
224
+ elements: None,
219
225
  })
220
226
  }
221
227
 
@@ -298,6 +304,7 @@ fn test_ocr_backend_used_for_image_extraction() {
298
304
  backend: "extraction-test-ocr".to_string(),
299
305
  language: "eng".to_string(),
300
306
  tesseract_config: None,
307
+ output_format: None,
301
308
  };
302
309
 
303
310
  let config = ExtractionConfig {
@@ -357,6 +364,7 @@ fn test_ocr_backend_receives_correct_parameters() {
357
364
  backend: "param-test-ocr".to_string(),
358
365
  language: "deu".to_string(),
359
366
  tesseract_config: None,
367
+ output_format: None,
360
368
  };
361
369
 
362
370
  let config = ExtractionConfig {
@@ -405,6 +413,7 @@ fn test_ocr_backend_returns_correct_format() {
405
413
  backend: "format-test-ocr".to_string(),
406
414
  language: "eng".to_string(),
407
415
  tesseract_config: None,
416
+ output_format: None,
408
417
  };
409
418
 
410
419
  let config = ExtractionConfig {
@@ -455,6 +464,7 @@ fn test_ocr_backend_error_handling() {
455
464
  backend: "failing-ocr".to_string(),
456
465
  language: "eng".to_string(),
457
466
  tesseract_config: None,
467
+ output_format: None,
458
468
  };
459
469
 
460
470
  let config = ExtractionConfig {
@@ -505,6 +515,7 @@ fn test_ocr_backend_validation_error() {
505
515
  backend: "validating-ocr".to_string(),
506
516
  language: "eng".to_string(),
507
517
  tesseract_config: None,
518
+ output_format: None,
508
519
  };
509
520
 
510
521
  let config = ExtractionConfig {
@@ -567,6 +578,7 @@ fn test_switching_between_ocr_backends() {
567
578
  backend: "backend-1".to_string(),
568
579
  language: "eng".to_string(),
569
580
  tesseract_config: None,
581
+ output_format: None,
570
582
  };
571
583
 
572
584
  let config1 = ExtractionConfig {
@@ -585,6 +597,7 @@ fn test_switching_between_ocr_backends() {
585
597
  backend: "backend-2".to_string(),
586
598
  language: "eng".to_string(),
587
599
  tesseract_config: None,
600
+ output_format: None,
588
601
  };
589
602
 
590
603
  let config2 = ExtractionConfig {
@@ -59,6 +59,8 @@ impl DocumentExtractor for FailingExtractor {
59
59
  chunks: None,
60
60
  images: None,
61
61
  pages: None,
62
+ elements: None,
63
+ djot_content: None,
62
64
  })
63
65
  }
64
66
  }
@@ -304,6 +306,8 @@ fn test_extractor_priority_ordering_complex() {
304
306
  chunks: None,
305
307
  images: None,
306
308
  pages: None,
309
+ elements: None,
310
+ djot_content: None,
307
311
  })
308
312
  }
309
313
  fn supported_mime_types(&self) -> &[&str] {
@@ -464,6 +468,8 @@ async fn test_processor_execution_order_within_stage() {
464
468
  chunks: None,
465
469
  images: None,
466
470
  pages: None,
471
+ elements: None,
472
+ djot_content: None,
467
473
  };
468
474
 
469
475
  let config = ExtractionConfig::default();
@@ -496,6 +502,8 @@ async fn test_processor_error_propagation() {
496
502
  chunks: None,
497
503
  images: None,
498
504
  pages: None,
505
+ elements: None,
506
+ djot_content: None,
499
507
  };
500
508
 
501
509
  let config = ExtractionConfig::default();
@@ -668,6 +676,8 @@ async fn test_validator_content_validation() {
668
676
  chunks: None,
669
677
  images: None,
670
678
  pages: None,
679
+ elements: None,
680
+ djot_content: None,
671
681
  };
672
682
 
673
683
  let validation = validators[0].validate(&short_result, &config).await;
@@ -682,6 +692,8 @@ async fn test_validator_content_validation() {
682
692
  chunks: None,
683
693
  images: None,
684
694
  pages: None,
695
+ elements: None,
696
+ djot_content: None,
685
697
  };
686
698
 
687
699
  let validation = validators[0].validate(&long_result, &config).await;
@@ -0,0 +1,504 @@
1
+ //! Regression tests for PPTX/PPSX extraction bugs
2
+ //!
3
+ //! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
4
+ //!
5
+ //! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
6
+ //! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
7
+
8
+ #![cfg(feature = "office")]
9
+
10
+ use kreuzberg::{ExtractionConfig, extract_file};
11
+ use std::io::Write;
12
+ use tempfile::NamedTempFile;
13
+ use zip::CompressionMethod;
14
+ use zip::write::{FileOptions, ZipWriter};
15
+
16
+ /// Test that PPSX (PowerPoint Show) files are extracted correctly.
17
+ ///
18
+ /// PPSX files use MIME type `application/vnd.openxmlformats-officedocument.presentationml.slideshow`
19
+ /// instead of PPTX's `application/vnd.openxmlformats-officedocument.presentationml.presentation`.
20
+ ///
21
+ /// The internal structure is identical to PPTX - same slide XML format.
22
+ ///
23
+ /// GitHub Issue #321 Bug 2
24
+ #[tokio::test]
25
+ async fn test_ppsx_slideshow_extraction() {
26
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
27
+ .parent()
28
+ .unwrap()
29
+ .parent()
30
+ .unwrap();
31
+ let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
32
+
33
+ if !test_file.exists() {
34
+ println!("Skipping test: PPSX test file not found at {:?}", test_file);
35
+ return;
36
+ }
37
+
38
+ let result = extract_file(&test_file, None, &ExtractionConfig::default()).await;
39
+
40
+ match result {
41
+ Ok(extraction) => {
42
+ assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
43
+ println!("✅ PPSX extraction succeeded!");
44
+ println!(" Content length: {} chars", extraction.content.len());
45
+ println!(
46
+ " Content preview: {}",
47
+ &extraction.content[..extraction.content.len().min(200)]
48
+ );
49
+ }
50
+ Err(e) => {
51
+ panic!(
52
+ "PPSX extraction failed with error: {:?}\n\
53
+ This is GitHub Issue #321 Bug 2: PPSX files should be supported.\n\
54
+ PPSX MIME type (application/vnd.openxmlformats-officedocument.presentationml.slideshow) \
55
+ needs to be added to extension-to-MIME mapping.",
56
+ e
57
+ );
58
+ }
59
+ }
60
+ }
61
+
62
+ /// Test that PPSX files can be extracted when MIME type is explicitly provided.
63
+ ///
64
+ /// This validates that the PPTX extractor can handle PPSX content correctly
65
+ /// (the XML structure is identical), even if MIME detection fails.
66
+ ///
67
+ /// GitHub Issue #321 Bug 2
68
+ #[tokio::test]
69
+ async fn test_ppsx_with_explicit_mime_type() {
70
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
71
+ .parent()
72
+ .unwrap()
73
+ .parent()
74
+ .unwrap();
75
+ let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
76
+
77
+ if !test_file.exists() {
78
+ println!("Skipping test: PPSX test file not found at {:?}", test_file);
79
+ return;
80
+ }
81
+
82
+ // Explicitly provide the PPSX MIME type
83
+ let result = extract_file(
84
+ &test_file,
85
+ Some("application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
86
+ &ExtractionConfig::default(),
87
+ )
88
+ .await;
89
+
90
+ match result {
91
+ Ok(extraction) => {
92
+ assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
93
+ println!("✅ PPSX extraction with explicit MIME type succeeded!");
94
+ }
95
+ Err(e) => {
96
+ panic!(
97
+ "PPSX extraction with explicit MIME type failed: {:?}\n\
98
+ The PPTX extractor should handle PPSX content (identical XML structure).",
99
+ e
100
+ );
101
+ }
102
+ }
103
+ }
104
+
105
+ /// Test that PPTX files with image placeholder shapes (no txBody) are extracted correctly.
106
+ ///
107
+ /// Some shapes in PPTX files, like image placeholders (`<p:ph type="pic"/>`), don't have
108
+ /// `<p:txBody>` children because they're designed to hold images, not text.
109
+ ///
110
+ /// The parser should skip shapes without txBody gracefully instead of failing.
111
+ ///
112
+ /// GitHub Issue #321 Bug 1
113
+ #[tokio::test]
114
+ async fn test_pptx_with_image_placeholder_no_txbody() {
115
+ // Create a minimal PPTX with a shape that has no txBody (image placeholder)
116
+ let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
117
+
118
+ {
119
+ let mut zip = ZipWriter::new(&mut temp_file);
120
+ let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
121
+
122
+ // Add [Content_Types].xml
123
+ zip.start_file("[Content_Types].xml", options).unwrap();
124
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
125
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
126
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
127
+ <Default Extension="xml" ContentType="application/xml"/>
128
+ <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
129
+ <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
130
+ </Types>"#).unwrap();
131
+
132
+ // Add _rels/.rels
133
+ zip.start_file("_rels/.rels", options).unwrap();
134
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
135
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
136
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
137
+ </Relationships>"#).unwrap();
138
+
139
+ // Add ppt/presentation.xml
140
+ zip.start_file("ppt/presentation.xml", options).unwrap();
141
+ zip.write_all(
142
+ br#"<?xml version="1.0" encoding="UTF-8"?>
143
+ <p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
144
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
145
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
146
+ <p:sldIdLst>
147
+ <p:sldId id="256" r:id="rId2"/>
148
+ </p:sldIdLst>
149
+ </p:presentation>"#,
150
+ )
151
+ .unwrap();
152
+
153
+ // Add ppt/_rels/presentation.xml.rels
154
+ zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
155
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
156
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
157
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
158
+ </Relationships>"#).unwrap();
159
+
160
+ // Add ppt/slides/slide1.xml with a shape WITHOUT txBody (image placeholder)
161
+ // This is the critical test case - a <p:sp> element with no <p:txBody>
162
+ zip.start_file("ppt/slides/slide1.xml", options).unwrap();
163
+ zip.write_all(
164
+ br#"<?xml version="1.0" encoding="UTF-8"?>
165
+ <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
166
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
167
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
168
+ <p:cSld>
169
+ <p:spTree>
170
+ <p:nvGrpSpPr>
171
+ <p:cNvPr id="1" name=""/>
172
+ <p:cNvGrpSpPr/>
173
+ <p:nvPr/>
174
+ </p:nvGrpSpPr>
175
+ <p:grpSpPr>
176
+ <a:xfrm>
177
+ <a:off x="0" y="0"/>
178
+ <a:ext cx="0" cy="0"/>
179
+ <a:chOff x="0" y="0"/>
180
+ <a:chExt cx="0" cy="0"/>
181
+ </a:xfrm>
182
+ </p:grpSpPr>
183
+
184
+ <!-- Normal text shape WITH txBody - this should be extracted -->
185
+ <p:sp>
186
+ <p:nvSpPr>
187
+ <p:cNvPr id="2" name="Title"/>
188
+ <p:cNvSpPr/>
189
+ <p:nvPr/>
190
+ </p:nvSpPr>
191
+ <p:spPr>
192
+ <a:xfrm>
193
+ <a:off x="0" y="0"/>
194
+ <a:ext cx="100000" cy="100000"/>
195
+ </a:xfrm>
196
+ <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
197
+ </p:spPr>
198
+ <p:txBody>
199
+ <a:bodyPr/>
200
+ <a:lstStyle/>
201
+ <a:p>
202
+ <a:r>
203
+ <a:rPr lang="en-US"/>
204
+ <a:t>This is the title text</a:t>
205
+ </a:r>
206
+ </a:p>
207
+ </p:txBody>
208
+ </p:sp>
209
+
210
+ <!-- IMAGE PLACEHOLDER shape WITHOUT txBody - this caused the "No txBody found" error -->
211
+ <!-- This is a valid PPTX structure - image placeholders don't contain text -->
212
+ <p:sp>
213
+ <p:nvSpPr>
214
+ <p:cNvPr id="99" name="Image Placeholder"/>
215
+ <p:cNvSpPr>
216
+ <a:spLocks noGrp="1"/>
217
+ </p:cNvSpPr>
218
+ <p:nvPr>
219
+ <p:ph type="pic" idx="1"/>
220
+ </p:nvPr>
221
+ </p:nvSpPr>
222
+ <p:spPr>
223
+ <a:xfrm>
224
+ <a:off x="0" y="0"/>
225
+ <a:ext cx="100000" cy="100000"/>
226
+ </a:xfrm>
227
+ <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
228
+ </p:spPr>
229
+ <!-- NOTE: No <p:txBody> here - this is valid for image placeholders -->
230
+ </p:sp>
231
+
232
+ <!-- Another normal text shape - should also be extracted -->
233
+ <p:sp>
234
+ <p:nvSpPr>
235
+ <p:cNvPr id="3" name="Content"/>
236
+ <p:cNvSpPr/>
237
+ <p:nvPr/>
238
+ </p:nvSpPr>
239
+ <p:spPr>
240
+ <a:xfrm>
241
+ <a:off x="0" y="200000"/>
242
+ <a:ext cx="100000" cy="100000"/>
243
+ </a:xfrm>
244
+ <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
245
+ </p:spPr>
246
+ <p:txBody>
247
+ <a:bodyPr/>
248
+ <a:lstStyle/>
249
+ <a:p>
250
+ <a:r>
251
+ <a:rPr lang="en-US"/>
252
+ <a:t>Content after image placeholder</a:t>
253
+ </a:r>
254
+ </a:p>
255
+ </p:txBody>
256
+ </p:sp>
257
+
258
+ </p:spTree>
259
+ </p:cSld>
260
+ </p:sld>"#,
261
+ )
262
+ .unwrap();
263
+
264
+ // Add ppt/slides/_rels/slide1.xml.rels (empty)
265
+ zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
266
+ zip.write_all(
267
+ br#"<?xml version="1.0" encoding="UTF-8"?>
268
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
269
+ </Relationships>"#,
270
+ )
271
+ .unwrap();
272
+
273
+ zip.finish().unwrap();
274
+ }
275
+
276
+ // Extract the PPTX file
277
+ let result = extract_file(
278
+ temp_file.path(),
279
+ Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
280
+ &ExtractionConfig::default(),
281
+ )
282
+ .await;
283
+
284
+ match result {
285
+ Ok(extraction) => {
286
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
287
+
288
+ // Verify we extracted text from shapes that DO have txBody
289
+ assert!(
290
+ extraction.content.contains("title text"),
291
+ "Should extract text from first shape with txBody. Got: {}",
292
+ extraction.content
293
+ );
294
+ assert!(
295
+ extraction.content.contains("Content after"),
296
+ "Should extract text from shape after image placeholder. Got: {}",
297
+ extraction.content
298
+ );
299
+
300
+ println!("✅ PPTX with image placeholder (no txBody) extraction succeeded!");
301
+ println!(" Content: {}", extraction.content);
302
+ }
303
+ Err(e) => {
304
+ let error_msg = format!("{:?}", e);
305
+ if error_msg.contains("No txBody found") {
306
+ panic!(
307
+ "PPTX extraction failed with 'No txBody found' error!\n\
308
+ This is GitHub Issue #321 Bug 1.\n\
309
+ The parser should skip shapes without txBody (image placeholders) \
310
+ instead of failing.\n\
311
+ Error: {:?}",
312
+ e
313
+ );
314
+ } else {
315
+ panic!("PPTX extraction failed with unexpected error: {:?}", e);
316
+ }
317
+ }
318
+ }
319
+ }
320
+
321
+ /// Test extraction of PPTX with multiple shapes, some with txBody, some without.
322
+ ///
323
+ /// This test verifies that:
324
+ /// 1. Shapes WITH txBody are extracted
325
+ /// 2. Shapes WITHOUT txBody (image placeholders, etc.) are skipped gracefully
326
+ /// 3. The extraction continues and doesn't fail on the first shape without txBody
327
+ ///
328
+ /// GitHub Issue #321 Bug 1
329
+ #[tokio::test]
330
+ async fn test_pptx_mixed_shapes_extraction() {
331
+ // Create a PPTX with multiple slides, each containing mixed shapes
332
+ let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
333
+
334
+ {
335
+ let mut zip = ZipWriter::new(&mut temp_file);
336
+ let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
337
+
338
+ // Add [Content_Types].xml
339
+ zip.start_file("[Content_Types].xml", options).unwrap();
340
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
341
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
342
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
343
+ <Default Extension="xml" ContentType="application/xml"/>
344
+ <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
345
+ <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
346
+ </Types>"#).unwrap();
347
+
348
+ // Add _rels/.rels
349
+ zip.start_file("_rels/.rels", options).unwrap();
350
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
351
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
352
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
353
+ </Relationships>"#).unwrap();
354
+
355
+ // Add ppt/presentation.xml
356
+ zip.start_file("ppt/presentation.xml", options).unwrap();
357
+ zip.write_all(
358
+ br#"<?xml version="1.0" encoding="UTF-8"?>
359
+ <p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
360
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
361
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
362
+ <p:sldIdLst>
363
+ <p:sldId id="256" r:id="rId2"/>
364
+ </p:sldIdLst>
365
+ </p:presentation>"#,
366
+ )
367
+ .unwrap();
368
+
369
+ // Add ppt/_rels/presentation.xml.rels
370
+ zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
371
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
372
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
373
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
374
+ </Relationships>"#).unwrap();
375
+
376
+ // Add slide with various shapes - some with txBody, some without
377
+ zip.start_file("ppt/slides/slide1.xml", options).unwrap();
378
+ zip.write_all(
379
+ br#"<?xml version="1.0" encoding="UTF-8"?>
380
+ <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
381
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
382
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
383
+ <p:cSld>
384
+ <p:spTree>
385
+ <p:nvGrpSpPr>
386
+ <p:cNvPr id="1" name=""/>
387
+ <p:cNvGrpSpPr/>
388
+ <p:nvPr/>
389
+ </p:nvGrpSpPr>
390
+ <p:grpSpPr/>
391
+
392
+ <!-- Shape 1: Normal text -->
393
+ <p:sp>
394
+ <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
395
+ <p:spPr/>
396
+ <p:txBody>
397
+ <a:bodyPr/><a:lstStyle/>
398
+ <a:p><a:r><a:t>First Text Shape</a:t></a:r></a:p>
399
+ </p:txBody>
400
+ </p:sp>
401
+
402
+ <!-- Shape 2: Image placeholder (NO txBody) -->
403
+ <p:sp>
404
+ <p:nvSpPr>
405
+ <p:cNvPr id="10" name="Picture Placeholder"/>
406
+ <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
407
+ <p:nvPr><p:ph type="pic"/></p:nvPr>
408
+ </p:nvSpPr>
409
+ <p:spPr/>
410
+ </p:sp>
411
+
412
+ <!-- Shape 3: Another text shape -->
413
+ <p:sp>
414
+ <p:nvSpPr><p:cNvPr id="3" name="Body"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
415
+ <p:spPr/>
416
+ <p:txBody>
417
+ <a:bodyPr/><a:lstStyle/>
418
+ <a:p><a:r><a:t>Second Text Shape</a:t></a:r></a:p>
419
+ </p:txBody>
420
+ </p:sp>
421
+
422
+ <!-- Shape 4: Chart placeholder (NO txBody) -->
423
+ <p:sp>
424
+ <p:nvSpPr>
425
+ <p:cNvPr id="11" name="Chart Placeholder"/>
426
+ <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
427
+ <p:nvPr><p:ph type="chart"/></p:nvPr>
428
+ </p:nvSpPr>
429
+ <p:spPr/>
430
+ </p:sp>
431
+
432
+ <!-- Shape 5: Content placeholder (NO txBody - empty) -->
433
+ <p:sp>
434
+ <p:nvSpPr>
435
+ <p:cNvPr id="12" name="Content Placeholder"/>
436
+ <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
437
+ <p:nvPr><p:ph type="body"/></p:nvPr>
438
+ </p:nvSpPr>
439
+ <p:spPr/>
440
+ </p:sp>
441
+
442
+ <!-- Shape 6: Final text shape -->
443
+ <p:sp>
444
+ <p:nvSpPr><p:cNvPr id="4" name="Footer"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
445
+ <p:spPr/>
446
+ <p:txBody>
447
+ <a:bodyPr/><a:lstStyle/>
448
+ <a:p><a:r><a:t>Third Text Shape</a:t></a:r></a:p>
449
+ </p:txBody>
450
+ </p:sp>
451
+
452
+ </p:spTree>
453
+ </p:cSld>
454
+ </p:sld>"#,
455
+ )
456
+ .unwrap();
457
+
458
+ // Add empty rels
459
+ zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
460
+ zip.write_all(
461
+ br#"<?xml version="1.0" encoding="UTF-8"?>
462
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
463
+ </Relationships>"#,
464
+ )
465
+ .unwrap();
466
+
467
+ zip.finish().unwrap();
468
+ }
469
+
470
+ let result = extract_file(
471
+ temp_file.path(),
472
+ Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
473
+ &ExtractionConfig::default(),
474
+ )
475
+ .await;
476
+
477
+ match result {
478
+ Ok(extraction) => {
479
+ // All three text shapes should be extracted
480
+ assert!(
481
+ extraction.content.contains("First Text Shape"),
482
+ "Should extract first text shape"
483
+ );
484
+ assert!(
485
+ extraction.content.contains("Second Text Shape"),
486
+ "Should extract second text shape (after image placeholder)"
487
+ );
488
+ assert!(
489
+ extraction.content.contains("Third Text Shape"),
490
+ "Should extract third text shape (after multiple placeholders)"
491
+ );
492
+
493
+ println!("✅ PPTX mixed shapes extraction succeeded!");
494
+ println!(" All text shapes extracted despite image/chart/content placeholders without txBody");
495
+ }
496
+ Err(e) => {
497
+ panic!(
498
+ "PPTX extraction failed: {:?}\n\
499
+ Shapes without txBody should be skipped gracefully.",
500
+ e
501
+ );
502
+ }
503
+ }
504
+ }
@@ -126,6 +126,8 @@ impl DocumentExtractor for MockExtractor {
126
126
  chunks: None,
127
127
  images: None,
128
128
  pages: None,
129
+ elements: None,
130
+ djot_content: None,
129
131
  })
130
132
  }
131
133
 
@@ -28,8 +28,9 @@ serde_json = { workspace = true }
28
28
  serde = { workspace = true }
29
29
  async-trait = { workspace = true }
30
30
  tokio = { workspace = true }
31
- html-to-markdown-rs = { version = "2.22.5", default-features = false }
31
+ html-to-markdown-rs = { version = "2.23.4", default-features = false }
32
32
  rayon = { version = "1.11", optional = true }
33
+ log = "0.4"
33
34
 
34
35
  [target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
35
36
  kreuzberg = { path = "../kreuzberg", features = [
@@ -70,6 +70,8 @@ fn create_test_result(content_size: usize, chunk_count: usize) -> ExtractionResu
70
70
  chunks,
71
71
  images: None,
72
72
  pages: None,
73
+ elements: None,
74
+ djot_content: None,
73
75
  }
74
76
  }
75
77