kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,558 @@
1
+ //! PowerPoint presentation extraction functions.
2
+ //!
3
+ //! This module provides PowerPoint (PPTX) file parsing by directly reading the Office Open XML
4
+ //! format. It extracts text content, slide structure, images, and presentation metadata.
5
+ //!
6
+ //! # Attribution
7
+ //!
8
+ //! This code is based on the [pptx-to-md](https://github.com/nilskruthoff/pptx-parser) library
9
+ //! by Nils Kruthoff, licensed under MIT OR Apache-2.0. The original code has been vendored and
10
+ //! adapted to integrate with Kreuzberg's architecture. See ATTRIBUTIONS.md for full license text.
11
+ //!
12
+ //! # Features
13
+ //!
14
+ //! - **Slide extraction**: Reads all slides from presentation
15
+ //! - **Text formatting**: Preserves bold, italic, underline formatting as Markdown
16
+ //! - **Image extraction**: Optionally extracts embedded images with metadata
17
+ //! - **Office metadata**: Extracts core properties, custom properties (when `office` feature enabled)
18
+ //! - **Structure preservation**: Maintains heading hierarchy and list structure
19
+ //!
20
+ //! # Supported Formats
21
+ //!
22
+ //! - `.pptx` - PowerPoint Presentation
23
+ //! - `.pptm` - PowerPoint Macro-Enabled Presentation
24
+ //! - `.ppsx` - PowerPoint Slide Show
25
+ //!
26
+ //! # Example
27
+ //!
28
+ //! ```rust
29
+ //! use kreuzberg::extraction::pptx::extract_pptx_from_path;
30
+ //!
31
+ //! # fn example() -> kreuzberg::Result<()> {
32
+ //! let result = extract_pptx_from_path("presentation.pptx", true, None)?;
33
+ //!
34
+ //! println!("Slide count: {}", result.slide_count);
35
+ //! println!("Image count: {}", result.image_count);
36
+ //! println!("Content:\n{}", result.content);
37
+ //! # Ok(())
38
+ //! # }
39
+ //! ```
40
+
41
+ mod container;
42
+ mod content_builder;
43
+ mod elements;
44
+ mod image_handling;
45
+ mod metadata;
46
+ mod parser;
47
+
48
+ use crate::error::Result;
49
+ use crate::types::{ExtractedImage, PptxExtractionResult};
50
+
51
+ use container::{PptxContainer, SlideIterator};
52
+ use content_builder::ContentBuilder;
53
+ use elements::{ParserConfig, SlideElement};
54
+ use image_handling::detect_image_format;
55
+ use metadata::{extract_all_notes, extract_metadata};
56
+
57
+ /// Extract PPTX content from a file path.
58
+ ///
59
+ /// # Arguments
60
+ ///
61
+ /// * `path` - Path to the PPTX file
62
+ /// * `extract_images` - Whether to extract embedded images
63
+ /// * `page_config` - Optional page configuration for boundary tracking
64
+ ///
65
+ /// # Returns
66
+ ///
67
+ /// A `PptxExtractionResult` containing extracted content, metadata, and images.
68
+ pub fn extract_pptx_from_path(
69
+ path: &str,
70
+ extract_images: bool,
71
+ page_config: Option<&crate::core::config::PageConfig>,
72
+ ) -> Result<PptxExtractionResult> {
73
+ let config = ParserConfig {
74
+ extract_images,
75
+ ..Default::default()
76
+ };
77
+
78
+ let mut container = PptxContainer::open(path)?;
79
+
80
+ let metadata = extract_metadata(&mut container.archive);
81
+
82
+ let notes = extract_all_notes(&mut container)?;
83
+
84
+ let mut iterator = SlideIterator::new(container);
85
+ let slide_count = iterator.slide_count();
86
+
87
+ let estimated_capacity = slide_count.saturating_mul(1000).max(8192);
88
+ let mut content_builder = ContentBuilder::with_page_config(estimated_capacity, page_config.cloned());
89
+
90
+ let mut total_image_count = 0;
91
+ let mut total_table_count = 0;
92
+ let mut extracted_images = Vec::new();
93
+
94
+ while let Some(slide) = iterator.next_slide()? {
95
+ let byte_start = if page_config.is_some() {
96
+ content_builder.start_slide(slide.slide_number)
97
+ } else {
98
+ 0
99
+ };
100
+
101
+ let slide_content = slide.to_markdown(&config);
102
+ content_builder.add_text(&slide_content);
103
+
104
+ if let Some(slide_notes) = notes.get(&slide.slide_number) {
105
+ content_builder.add_notes(slide_notes);
106
+ }
107
+
108
+ if page_config.is_some() {
109
+ content_builder.end_slide(slide.slide_number, byte_start, slide_content.clone());
110
+ }
111
+
112
+ if config.extract_images
113
+ && let Ok(image_data) = iterator.get_slide_images(&slide)
114
+ {
115
+ for (_, data) in image_data {
116
+ let format = detect_image_format(&data);
117
+ let image_index = extracted_images.len();
118
+
119
+ extracted_images.push(ExtractedImage {
120
+ data,
121
+ format,
122
+ image_index,
123
+ page_number: Some(slide.slide_number as usize),
124
+ width: None,
125
+ height: None,
126
+ colorspace: None,
127
+ bits_per_component: None,
128
+ is_mask: false,
129
+ description: None,
130
+ ocr_result: None,
131
+ });
132
+ }
133
+ }
134
+
135
+ total_image_count += slide.image_count();
136
+ total_table_count += slide.table_count();
137
+ }
138
+
139
+ let (content, boundaries, page_contents) = content_builder.build();
140
+
141
+ let page_structure = boundaries.as_ref().map(|bounds| crate::types::PageStructure {
142
+ total_count: slide_count,
143
+ unit_type: crate::types::PageUnitType::Slide,
144
+ boundaries: Some(bounds.clone()),
145
+ pages: page_contents.as_ref().map(|pcs| {
146
+ pcs.iter()
147
+ .map(|pc| crate::types::PageInfo {
148
+ number: pc.page_number,
149
+ title: None,
150
+ dimensions: None,
151
+ image_count: None,
152
+ table_count: None,
153
+ hidden: None,
154
+ })
155
+ .collect()
156
+ }),
157
+ });
158
+
159
+ Ok(PptxExtractionResult {
160
+ content,
161
+ metadata,
162
+ slide_count,
163
+ image_count: total_image_count,
164
+ table_count: total_table_count,
165
+ images: extracted_images,
166
+ page_structure,
167
+ page_contents,
168
+ })
169
+ }
170
+
171
+ /// Extract PPTX content from a byte buffer.
172
+ ///
173
+ /// # Arguments
174
+ ///
175
+ /// * `data` - Raw PPTX file bytes
176
+ /// * `extract_images` - Whether to extract embedded images
177
+ /// * `page_config` - Optional page configuration for boundary tracking
178
+ ///
179
+ /// # Returns
180
+ ///
181
+ /// A `PptxExtractionResult` containing extracted content, metadata, and images.
182
+ pub fn extract_pptx_from_bytes(
183
+ data: &[u8],
184
+ extract_images: bool,
185
+ page_config: Option<&crate::core::config::PageConfig>,
186
+ ) -> Result<PptxExtractionResult> {
187
+ use std::sync::atomic::{AtomicU64, Ordering};
188
+ static COUNTER: AtomicU64 = AtomicU64::new(0);
189
+ let unique_id = COUNTER.fetch_add(1, Ordering::SeqCst);
190
+ let temp_path = std::env::temp_dir().join(format!("temp_pptx_{}_{}.pptx", std::process::id(), unique_id));
191
+
192
+ // IO errors must bubble up - temp file write issues need user reports ~keep
193
+ std::fs::write(&temp_path, data)?;
194
+
195
+ let result = extract_pptx_from_path(
196
+ temp_path.to_str().ok_or_else(|| {
197
+ crate::KreuzbergError::validation("Invalid temp path - contains invalid UTF-8".to_string())
198
+ })?,
199
+ extract_images,
200
+ page_config,
201
+ );
202
+
203
+ if let Err(e) = std::fs::remove_file(&temp_path) {
204
+ tracing::warn!("Failed to remove temp PPTX file: {}", e);
205
+ }
206
+
207
+ result
208
+ }
209
+
210
+ // Re-export Slide implementation methods for internal use
211
+ impl elements::Slide {
212
+ fn from_xml(slide_number: u32, xml_data: &[u8], rels_data: Option<&[u8]>) -> Result<Self> {
213
+ let elements = parser::parse_slide_xml(xml_data)?;
214
+
215
+ let images = if let Some(rels) = rels_data {
216
+ parser::parse_slide_rels(rels)?
217
+ } else {
218
+ Vec::new()
219
+ };
220
+
221
+ Ok(Self {
222
+ slide_number,
223
+ elements,
224
+ images,
225
+ })
226
+ }
227
+
228
+ fn to_markdown(&self, config: &ParserConfig) -> String {
229
+ let mut builder = ContentBuilder::new();
230
+
231
+ if config.include_slide_comment {
232
+ builder.add_slide_header(self.slide_number);
233
+ }
234
+
235
+ let mut element_indices: Vec<usize> = (0..self.elements.len()).collect();
236
+ element_indices.sort_by_key(|&i| {
237
+ let pos = self.elements[i].position();
238
+ (pos.y, pos.x)
239
+ });
240
+
241
+ for &idx in &element_indices {
242
+ match &self.elements[idx] {
243
+ SlideElement::Text(text, _) => {
244
+ let text_content: String = text.runs.iter().map(|run| run.render_as_md()).collect();
245
+
246
+ let normalized = text_content.replace('\n', " ");
247
+ let is_title = normalized.len() < 100 && !normalized.trim().is_empty();
248
+
249
+ if is_title {
250
+ builder.add_title(normalized.trim());
251
+ } else {
252
+ builder.add_text(&text_content);
253
+ }
254
+ }
255
+ SlideElement::Table(table, _) => {
256
+ let table_rows: Vec<Vec<String>> = table
257
+ .rows
258
+ .iter()
259
+ .map(|row| {
260
+ row.cells
261
+ .iter()
262
+ .map(|cell| cell.runs.iter().map(|run| run.extract()).collect::<String>())
263
+ .collect()
264
+ })
265
+ .collect();
266
+ builder.add_table(&table_rows);
267
+ }
268
+ SlideElement::List(list, _) => {
269
+ for item in &list.items {
270
+ let item_text: String = item.runs.iter().map(|run| run.extract()).collect();
271
+ builder.add_list_item(item.level, item.is_ordered, &item_text);
272
+ }
273
+ }
274
+ SlideElement::Image(img_ref, _) => {
275
+ builder.add_image(&img_ref.id, self.slide_number);
276
+ }
277
+ SlideElement::Unknown => {}
278
+ }
279
+ }
280
+
281
+ builder.build().0
282
+ }
283
+
284
+ fn image_count(&self) -> usize {
285
+ self.elements
286
+ .iter()
287
+ .filter(|e| matches!(e, SlideElement::Image(_, _)))
288
+ .count()
289
+ }
290
+
291
+ fn table_count(&self) -> usize {
292
+ self.elements
293
+ .iter()
294
+ .filter(|e| matches!(e, SlideElement::Table(_, _)))
295
+ .count()
296
+ }
297
+ }
298
+
299
+ #[cfg(test)]
300
+ mod tests {
301
+ use super::*;
302
+
303
/// Builds a minimal PPTX archive in memory for use by the tests in this module.
///
/// The ZIP contains `[Content_Types].xml`, `ppt/presentation.xml`, `_rels/.rels`,
/// `ppt/_rels/presentation.xml.rels`, one `ppt/slides/slideN.xml` per entry of `slides`,
/// `docProps/core.xml`, and `docProps/app.xml` (whose `<Slides>` value is `slides.len()`).
///
/// Panics if any ZIP operation fails, since every write is unwrapped.
+ fn create_test_pptx_bytes(slides: Vec<&str>) -> Vec<u8> {
304
+ use std::io::Write;
305
+ use zip::write::{SimpleFileOptions, ZipWriter};
306
+
307
+ let mut buffer = Vec::new();
308
// Inner scope: the writer mutably borrows `buffer` through the cursor, and that borrow
// has to end before `buffer` is returned below.
+ {
309
+ let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
310
+ let options = SimpleFileOptions::default();
311
+
312
+ zip.start_file("[Content_Types].xml", options).unwrap();
313
+ zip.write_all(
314
+ br#"<?xml version="1.0" encoding="UTF-8"?>
315
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
316
+ <Default Extension="xml" ContentType="application/xml"/>
317
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
318
+ </Types>"#,
319
+ )
320
+ .unwrap();
321
+
322
+ zip.start_file("ppt/presentation.xml", options).unwrap();
323
+ zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
324
+
325
+ zip.start_file("_rels/.rels", options).unwrap();
326
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
327
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
328
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
329
+ </Relationships>"#).unwrap();
330
+
331
+ let mut rels_xml = String::from(
332
+ r#"<?xml version="1.0" encoding="UTF-8"?>
333
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
334
+ );
335
// One slide relationship per input string; relationship ids and slide file names are 1-based.
+ for (i, _) in slides.iter().enumerate() {
336
+ rels_xml.push_str(&format!(
337
+ r#"<Relationship Id="rId{}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide{}.xml"/>"#,
338
+ i + 1,
339
+ i + 1
340
+ ));
341
+ }
342
+ rels_xml.push_str("</Relationships>");
343
+ zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
344
+ zip.write_all(rels_xml.as_bytes()).unwrap();
345
+
346
// Each slide holds a single shape with one paragraph and one text run; the input string is
// interpolated as-is, without XML escaping.
+ for (i, text) in slides.iter().enumerate() {
347
+ let slide_xml = format!(
348
+ r#"<?xml version="1.0" encoding="UTF-8"?>
349
+ <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
350
+ xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
351
+ <p:cSld>
352
+ <p:spTree>
353
+ <p:sp>
354
+ <p:txBody>
355
+ <a:p>
356
+ <a:r>
357
+ <a:t>{}</a:t>
358
+ </a:r>
359
+ </a:p>
360
+ </p:txBody>
361
+ </p:sp>
362
+ </p:spTree>
363
+ </p:cSld>
364
+ </p:sld>"#,
365
+ text
366
+ );
367
+ zip.start_file(format!("ppt/slides/slide{}.xml", i + 1), options)
368
+ .unwrap();
369
+ zip.write_all(slide_xml.as_bytes()).unwrap();
370
+ }
371
+
372
+ zip.start_file("docProps/core.xml", options).unwrap();
373
+ zip.write_all(
374
+ br#"<?xml version="1.0" encoding="UTF-8"?>
375
+ <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
376
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
377
+ xmlns:dcterms="http://purl.org/dc/terms/">
378
+ <dc:title>Test Presentation</dc:title>
379
+ <dc:creator>Test Author</dc:creator>
380
+ <dc:description>Test Description</dc:description>
381
+ <dc:subject>Test Subject</dc:subject>
382
+ </cp:coreProperties>"#,
383
+ )
384
+ .unwrap();
385
+
386
+ // Add app.xml with slide count
387
+ let app_xml = format!(
388
+ r#"<?xml version="1.0" encoding="UTF-8"?>
389
+ <Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"
390
+ xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes">
391
+ <Slides>{}</Slides>
392
+ <Application>Microsoft Office PowerPoint</Application>
393
+ </Properties>"#,
394
+ slides.len()
395
+ );
396
+ zip.start_file("docProps/app.xml", options).unwrap();
397
+ zip.write_all(app_xml.as_bytes()).unwrap();
398
+
399
+ let _ = zip.finish().unwrap();
400
+ }
401
+ buffer
402
+ }
403
+
404
/// A one-slide fixture must report one slide, contain the slide's text,
/// and report zero images and zero tables.
#[test]
fn test_extract_pptx_from_bytes_single_slide() {
    let deck = create_test_pptx_bytes(vec!["Hello World"]);
    let extracted = extract_pptx_from_bytes(&deck, false, None).unwrap();

    assert_eq!(extracted.slide_count, 1);

    // Print the extracted content on failure so a mismatch is easy to diagnose.
    let has_text = extracted.content.contains("Hello World");
    assert!(has_text, "Content was: {}", extracted.content);

    assert_eq!(extracted.image_count, 0);
    assert_eq!(extracted.table_count, 0);
}
418
+
419
/// A three-slide fixture must report three slides and expose every
/// slide's text in the extracted content.
#[test]
fn test_extract_pptx_from_bytes_multiple_slides() {
    let slide_texts = ["Slide 1", "Slide 2", "Slide 3"];
    let deck = create_test_pptx_bytes(slide_texts.to_vec());
    let extracted = extract_pptx_from_bytes(&deck, false, None).unwrap();

    assert_eq!(extracted.slide_count, 3);
    for expected in slide_texts.iter() {
        assert!(extracted.content.contains(expected));
    }
}
429
+
430
/// The metadata attached to the extraction result must carry the slide count.
#[test]
fn test_extract_pptx_metadata() {
    let deck = create_test_pptx_bytes(vec!["Content"]);
    let extracted = extract_pptx_from_bytes(&deck, false, None).unwrap();

    // The fixture holds exactly one slide, so the metadata must say so too.
    let reported_slides = extracted.metadata.slide_count;
    assert_eq!(reported_slides, 1);
}
438
+
439
/// Slides whose text is empty are still counted as slides.
#[test]
fn test_extract_pptx_empty_slides() {
    let blank_slides = vec![""; 3];
    let deck = create_test_pptx_bytes(blank_slides);
    let extracted = extract_pptx_from_bytes(&deck, false, None).unwrap();

    assert_eq!(extracted.slide_count, 3);
}
446
+
447
/// Bytes that are not a PPTX archive must be rejected with a
/// `KreuzbergError::Parsing` whose message names either the archive read
/// or the temp-file write as the failing step.
#[test]
fn test_extract_pptx_from_bytes_invalid_data() {
    use crate::error::KreuzbergError;

    let garbage = b"not a valid pptx file";
    let outcome = extract_pptx_from_bytes(garbage, false, None);

    assert!(outcome.is_err());
    match outcome {
        Err(KreuzbergError::Parsing { message: msg, .. }) => {
            let archive_failure = msg.contains("Failed to read PPTX archive");
            assert!(archive_failure || msg.contains("Failed to write temp PPTX file"));
        }
        _ => panic!("Expected ParsingError"),
    }
}
461
+
462
/// A zero-length input must produce an error rather than a result.
#[test]
fn test_extract_pptx_from_bytes_empty_data() {
    let no_bytes: &[u8] = b"";
    let outcome = extract_pptx_from_bytes(no_bytes, false, None);

    assert!(outcome.is_err());
}
469
+
470
/// The leading bytes `FF D8 FF E0` must be reported as "jpeg".
#[test]
fn test_detect_image_format_jpeg() {
    let magic: [u8; 4] = [0xFF, 0xD8, 0xFF, 0xE0];
    let detected = detect_image_format(&magic);
    assert_eq!(detected, "jpeg");
}
475
+
476
/// The leading bytes `89 50 4E 47` must be reported as "png".
#[test]
fn test_detect_image_format_png() {
    let magic: [u8; 4] = [0x89, 0x50, 0x4E, 0x47];
    let detected = detect_image_format(&magic);
    assert_eq!(detected, "png");
}
481
+
482
/// Data beginning with the ASCII text `GIF89a` must be reported as "gif".
#[test]
fn test_detect_image_format_gif() {
    let detected = detect_image_format("GIF89a".as_bytes());
    assert_eq!(detected, "gif");
}
487
+
488
/// The two-byte input `BM` must be reported as "bmp".
#[test]
fn test_detect_image_format_bmp() {
    let magic: [u8; 2] = [b'B', b'M'];
    let detected = detect_image_format(&magic);
    assert_eq!(detected, "bmp");
}
493
+
494
/// Markup that opens with an `<svg ...>` tag must be reported as "svg".
#[test]
fn test_detect_image_format_svg() {
    let markup = br#"<svg xmlns="http://www.w3.org/2000/svg">"#;
    let detected = detect_image_format(markup);
    assert_eq!(detected, "svg");
}
499
+
500
/// The little-endian TIFF signature `49 49 2A 00` must be reported as "tiff".
#[test]
fn test_detect_image_format_tiff_little_endian() {
    let magic: [u8; 4] = [0x49, 0x49, 0x2A, 0x00];
    let detected = detect_image_format(&magic);
    assert_eq!(detected, "tiff");
}
505
+
506
/// The big-endian TIFF signature `4D 4D 00 2A` must be reported as "tiff".
#[test]
fn test_detect_image_format_tiff_big_endian() {
    let magic: [u8; 4] = [0x4D, 0x4D, 0x00, 0x2A];
    let detected = detect_image_format(&magic);
    assert_eq!(detected, "tiff");
}
511
+
512
/// Bytes that match none of the recognised signatures must be reported as "unknown".
#[test]
fn test_detect_image_format_unknown() {
    let detected = detect_image_format("unknown format".as_bytes());
    assert_eq!(detected, "unknown");
}
517
+
518
/// `html_escape` must leave plain text alone and replace `&`, `<`, `>`,
/// `"` and `'` with their entity forms, including when they are mixed.
#[test]
fn test_html_escape() {
    // (input, expected output) pairs, checked in order.
    let cases = [
        ("plain text", "plain text"),
        ("a & b", "a &amp; b"),
        ("<tag>", "&lt;tag&gt;"),
        ("\"quoted\"", "&quot;quoted&quot;"),
        ("'apostrophe'", "&#x27;apostrophe&#x27;"),
        (
            "<a href=\"url\" title='test'>text & more</a>",
            "&lt;a href=&quot;url&quot; title=&#x27;test&#x27;&gt;text &amp; more&lt;/a&gt;",
        ),
    ];

    for &(raw, escaped) in &cases {
        assert_eq!(image_handling::html_escape(raw), escaped);
    }
}
530
+
531
/// A slide path must map to the matching `.rels` file inside the sibling
/// `_rels` directory, for single- and double-digit slide numbers alike.
#[test]
fn test_get_slide_rels_path() {
    // (slide path, expected relationships path) pairs.
    let cases = [
        ("ppt/slides/slide1.xml", "ppt/slides/_rels/slide1.xml.rels"),
        ("ppt/slides/slide10.xml", "ppt/slides/_rels/slide10.xml.rels"),
    ];

    for &(slide_path, rels_path) in &cases {
        assert_eq!(image_handling::get_slide_rels_path(slide_path), rels_path);
    }
}
542
+
543
/// A `../`-prefixed image target must be resolved against the slide's
/// directory, giving a path under `ppt/media`.
#[test]
fn test_get_full_image_path_relative() {
    let slide_path = "ppt/slides/slide1.xml";
    let target = "../media/image1.png";

    let resolved = image_handling::get_full_image_path(slide_path, target);
    assert_eq!(resolved, "ppt/media/image1.png");
}
550
+
551
/// A bare image file name must be placed in the same directory as the slide.
#[test]
fn test_get_full_image_path_direct() {
    let slide_path = "ppt/slides/slide1.xml";
    let target = "image1.png";

    let resolved = image_handling::get_full_image_path(slide_path, target);
    assert_eq!(resolved, "ppt/slides/image1.png");
}
558
+ }