kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,558 @@
1
+ //! PowerPoint presentation extraction functions.
2
+ //!
3
+ //! This module provides PowerPoint (PPTX) file parsing by directly reading the Office Open XML
4
+ //! format. It extracts text content, slide structure, images, and presentation metadata.
5
+ //!
6
+ //! # Attribution
7
+ //!
8
+ //! This code is based on the [pptx-to-md](https://github.com/nilskruthoff/pptx-parser) library
9
+ //! by Nils Kruthoff, licensed under MIT OR Apache-2.0. The original code has been vendored and
10
+ //! adapted to integrate with Kreuzberg's architecture. See ATTRIBUTIONS.md for full license text.
11
+ //!
12
+ //! # Features
13
+ //!
14
+ //! - **Slide extraction**: Reads all slides from presentation
15
+ //! - **Text formatting**: Preserves bold, italic, underline formatting as Markdown
16
+ //! - **Image extraction**: Optionally extracts embedded images with metadata
17
+ //! - **Office metadata**: Extracts core properties, custom properties (when `office` feature enabled)
18
+ //! - **Structure preservation**: Maintains heading hierarchy and list structure
19
+ //!
20
+ //! # Supported Formats
21
+ //!
22
+ //! - `.pptx` - PowerPoint Presentation
23
+ //! - `.pptm` - PowerPoint Macro-Enabled Presentation
24
+ //! - `.ppsx` - PowerPoint Slide Show
25
+ //!
26
+ //! # Example
27
+ //!
28
+ //! ```rust
29
+ //! use kreuzberg::extraction::pptx::extract_pptx_from_path;
30
+ //!
31
+ //! # fn example() -> kreuzberg::Result<()> {
32
+ //! let result = extract_pptx_from_path("presentation.pptx", true, None)?;
33
+ //!
34
+ //! println!("Slide count: {}", result.slide_count);
35
+ //! println!("Image count: {}", result.image_count);
36
+ //! println!("Content:\n{}", result.content);
37
+ //! # Ok(())
38
+ //! # }
39
+ //! ```
40
+
41
+ mod container;
42
+ mod content_builder;
43
+ mod elements;
44
+ mod image_handling;
45
+ mod metadata;
46
+ mod parser;
47
+
48
+ use crate::error::Result;
49
+ use crate::types::{ExtractedImage, PptxExtractionResult};
50
+
51
+ use container::{PptxContainer, SlideIterator};
52
+ use content_builder::ContentBuilder;
53
+ use elements::{ParserConfig, SlideElement};
54
+ use image_handling::detect_image_format;
55
+ use metadata::{extract_all_notes, extract_metadata};
56
+
57
+ /// Extract PPTX content from a file path.
58
+ ///
59
+ /// # Arguments
60
+ ///
61
+ /// * `path` - Path to the PPTX file
62
+ /// * `extract_images` - Whether to extract embedded images
63
+ /// * `page_config` - Optional page configuration for boundary tracking
64
+ ///
65
+ /// # Returns
66
+ ///
67
+ /// A `PptxExtractionResult` containing extracted content, metadata, and images.
68
+ pub fn extract_pptx_from_path(
69
+ path: &str,
70
+ extract_images: bool,
71
+ page_config: Option<&crate::core::config::PageConfig>,
72
+ ) -> Result<PptxExtractionResult> {
73
+ let config = ParserConfig {
74
+ extract_images,
75
+ ..Default::default()
76
+ };
77
+
78
+ let mut container = PptxContainer::open(path)?;
79
+
80
+ let metadata = extract_metadata(&mut container.archive);
81
+
82
+ let notes = extract_all_notes(&mut container)?;
83
+
84
+ let mut iterator = SlideIterator::new(container);
85
+ let slide_count = iterator.slide_count();
86
+
87
+ let estimated_capacity = slide_count.saturating_mul(1000).max(8192);
88
+ let mut content_builder = ContentBuilder::with_page_config(estimated_capacity, page_config.cloned());
89
+
90
+ let mut total_image_count = 0;
91
+ let mut total_table_count = 0;
92
+ let mut extracted_images = Vec::new();
93
+
94
+ while let Some(slide) = iterator.next_slide()? {
95
+ let byte_start = if page_config.is_some() {
96
+ content_builder.start_slide(slide.slide_number)
97
+ } else {
98
+ 0
99
+ };
100
+
101
+ let slide_content = slide.to_markdown(&config);
102
+ content_builder.add_text(&slide_content);
103
+
104
+ if let Some(slide_notes) = notes.get(&slide.slide_number) {
105
+ content_builder.add_notes(slide_notes);
106
+ }
107
+
108
+ if page_config.is_some() {
109
+ content_builder.end_slide(slide.slide_number, byte_start, slide_content.clone());
110
+ }
111
+
112
+ if config.extract_images
113
+ && let Ok(image_data) = iterator.get_slide_images(&slide)
114
+ {
115
+ for (_, data) in image_data {
116
+ let format = detect_image_format(&data);
117
+ let image_index = extracted_images.len();
118
+
119
+ extracted_images.push(ExtractedImage {
120
+ data,
121
+ format,
122
+ image_index,
123
+ page_number: Some(slide.slide_number as usize),
124
+ width: None,
125
+ height: None,
126
+ colorspace: None,
127
+ bits_per_component: None,
128
+ is_mask: false,
129
+ description: None,
130
+ ocr_result: None,
131
+ });
132
+ }
133
+ }
134
+
135
+ total_image_count += slide.image_count();
136
+ total_table_count += slide.table_count();
137
+ }
138
+
139
+ let (content, boundaries, page_contents) = content_builder.build();
140
+
141
+ let page_structure = boundaries.as_ref().map(|bounds| crate::types::PageStructure {
142
+ total_count: slide_count,
143
+ unit_type: crate::types::PageUnitType::Slide,
144
+ boundaries: Some(bounds.clone()),
145
+ pages: page_contents.as_ref().map(|pcs| {
146
+ pcs.iter()
147
+ .map(|pc| crate::types::PageInfo {
148
+ number: pc.page_number,
149
+ title: None,
150
+ dimensions: None,
151
+ image_count: None,
152
+ table_count: None,
153
+ hidden: None,
154
+ })
155
+ .collect()
156
+ }),
157
+ });
158
+
159
+ Ok(PptxExtractionResult {
160
+ content,
161
+ metadata,
162
+ slide_count,
163
+ image_count: total_image_count,
164
+ table_count: total_table_count,
165
+ images: extracted_images,
166
+ page_structure,
167
+ page_contents,
168
+ })
169
+ }
170
+
171
+ /// Extract PPTX content from a byte buffer.
172
+ ///
173
+ /// # Arguments
174
+ ///
175
+ /// * `data` - Raw PPTX file bytes
176
+ /// * `extract_images` - Whether to extract embedded images
177
+ /// * `page_config` - Optional page configuration for boundary tracking
178
+ ///
179
+ /// # Returns
180
+ ///
181
+ /// A `PptxExtractionResult` containing extracted content, metadata, and images.
182
+ pub fn extract_pptx_from_bytes(
183
+ data: &[u8],
184
+ extract_images: bool,
185
+ page_config: Option<&crate::core::config::PageConfig>,
186
+ ) -> Result<PptxExtractionResult> {
187
+ use std::sync::atomic::{AtomicU64, Ordering};
188
+ static COUNTER: AtomicU64 = AtomicU64::new(0);
189
+ let unique_id = COUNTER.fetch_add(1, Ordering::SeqCst);
190
+ let temp_path = std::env::temp_dir().join(format!("temp_pptx_{}_{}.pptx", std::process::id(), unique_id));
191
+
192
+ // IO errors must bubble up - temp file write issues need user reports ~keep
193
+ std::fs::write(&temp_path, data)?;
194
+
195
+ let result = extract_pptx_from_path(
196
+ temp_path.to_str().ok_or_else(|| {
197
+ crate::KreuzbergError::validation("Invalid temp path - contains invalid UTF-8".to_string())
198
+ })?,
199
+ extract_images,
200
+ page_config,
201
+ );
202
+
203
+ if let Err(e) = std::fs::remove_file(&temp_path) {
204
+ tracing::warn!("Failed to remove temp PPTX file: {}", e);
205
+ }
206
+
207
+ result
208
+ }
209
+
210
+ // Re-export Slide implementation methods for internal use
211
+ impl elements::Slide {
212
+ fn from_xml(slide_number: u32, xml_data: &[u8], rels_data: Option<&[u8]>) -> Result<Self> {
213
+ let elements = parser::parse_slide_xml(xml_data)?;
214
+
215
+ let images = if let Some(rels) = rels_data {
216
+ parser::parse_slide_rels(rels)?
217
+ } else {
218
+ Vec::new()
219
+ };
220
+
221
+ Ok(Self {
222
+ slide_number,
223
+ elements,
224
+ images,
225
+ })
226
+ }
227
+
228
+ fn to_markdown(&self, config: &ParserConfig) -> String {
229
+ let mut builder = ContentBuilder::new();
230
+
231
+ if config.include_slide_comment {
232
+ builder.add_slide_header(self.slide_number);
233
+ }
234
+
235
+ let mut element_indices: Vec<usize> = (0..self.elements.len()).collect();
236
+ element_indices.sort_by_key(|&i| {
237
+ let pos = self.elements[i].position();
238
+ (pos.y, pos.x)
239
+ });
240
+
241
+ for &idx in &element_indices {
242
+ match &self.elements[idx] {
243
+ SlideElement::Text(text, _) => {
244
+ let text_content: String = text.runs.iter().map(|run| run.render_as_md()).collect();
245
+
246
+ let normalized = text_content.replace('\n', " ");
247
+ let is_title = normalized.len() < 100 && !normalized.trim().is_empty();
248
+
249
+ if is_title {
250
+ builder.add_title(normalized.trim());
251
+ } else {
252
+ builder.add_text(&text_content);
253
+ }
254
+ }
255
+ SlideElement::Table(table, _) => {
256
+ let table_rows: Vec<Vec<String>> = table
257
+ .rows
258
+ .iter()
259
+ .map(|row| {
260
+ row.cells
261
+ .iter()
262
+ .map(|cell| cell.runs.iter().map(|run| run.extract()).collect::<String>())
263
+ .collect()
264
+ })
265
+ .collect();
266
+ builder.add_table(&table_rows);
267
+ }
268
+ SlideElement::List(list, _) => {
269
+ for item in &list.items {
270
+ let item_text: String = item.runs.iter().map(|run| run.extract()).collect();
271
+ builder.add_list_item(item.level, item.is_ordered, &item_text);
272
+ }
273
+ }
274
+ SlideElement::Image(img_ref, _) => {
275
+ builder.add_image(&img_ref.id, self.slide_number);
276
+ }
277
+ SlideElement::Unknown => {}
278
+ }
279
+ }
280
+
281
+ builder.build().0
282
+ }
283
+
284
+ fn image_count(&self) -> usize {
285
+ self.elements
286
+ .iter()
287
+ .filter(|e| matches!(e, SlideElement::Image(_, _)))
288
+ .count()
289
+ }
290
+
291
+ fn table_count(&self) -> usize {
292
+ self.elements
293
+ .iter()
294
+ .filter(|e| matches!(e, SlideElement::Table(_, _)))
295
+ .count()
296
+ }
297
+ }
298
+
299
+ #[cfg(test)]
300
+ mod tests {
301
+ use super::*;
302
+
303
+ fn create_test_pptx_bytes(slides: Vec<&str>) -> Vec<u8> {
304
+ use std::io::Write;
305
+ use zip::write::{SimpleFileOptions, ZipWriter};
306
+
307
+ let mut buffer = Vec::new();
308
+ {
309
+ let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
310
+ let options = SimpleFileOptions::default();
311
+
312
+ zip.start_file("[Content_Types].xml", options).unwrap();
313
+ zip.write_all(
314
+ br#"<?xml version="1.0" encoding="UTF-8"?>
315
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
316
+ <Default Extension="xml" ContentType="application/xml"/>
317
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
318
+ </Types>"#,
319
+ )
320
+ .unwrap();
321
+
322
+ zip.start_file("ppt/presentation.xml", options).unwrap();
323
+ zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
324
+
325
+ zip.start_file("_rels/.rels", options).unwrap();
326
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
327
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
328
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
329
+ </Relationships>"#).unwrap();
330
+
331
+ let mut rels_xml = String::from(
332
+ r#"<?xml version="1.0" encoding="UTF-8"?>
333
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
334
+ );
335
+ for (i, _) in slides.iter().enumerate() {
336
+ rels_xml.push_str(&format!(
337
+ r#"<Relationship Id="rId{}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide{}.xml"/>"#,
338
+ i + 1,
339
+ i + 1
340
+ ));
341
+ }
342
+ rels_xml.push_str("</Relationships>");
343
+ zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
344
+ zip.write_all(rels_xml.as_bytes()).unwrap();
345
+
346
+ for (i, text) in slides.iter().enumerate() {
347
+ let slide_xml = format!(
348
+ r#"<?xml version="1.0" encoding="UTF-8"?>
349
+ <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
350
+ xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
351
+ <p:cSld>
352
+ <p:spTree>
353
+ <p:sp>
354
+ <p:txBody>
355
+ <a:p>
356
+ <a:r>
357
+ <a:t>{}</a:t>
358
+ </a:r>
359
+ </a:p>
360
+ </p:txBody>
361
+ </p:sp>
362
+ </p:spTree>
363
+ </p:cSld>
364
+ </p:sld>"#,
365
+ text
366
+ );
367
+ zip.start_file(format!("ppt/slides/slide{}.xml", i + 1), options)
368
+ .unwrap();
369
+ zip.write_all(slide_xml.as_bytes()).unwrap();
370
+ }
371
+
372
+ zip.start_file("docProps/core.xml", options).unwrap();
373
+ zip.write_all(
374
+ br#"<?xml version="1.0" encoding="UTF-8"?>
375
+ <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
376
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
377
+ xmlns:dcterms="http://purl.org/dc/terms/">
378
+ <dc:title>Test Presentation</dc:title>
379
+ <dc:creator>Test Author</dc:creator>
380
+ <dc:description>Test Description</dc:description>
381
+ <dc:subject>Test Subject</dc:subject>
382
+ </cp:coreProperties>"#,
383
+ )
384
+ .unwrap();
385
+
386
+ // Add app.xml with slide count
387
+ let app_xml = format!(
388
+ r#"<?xml version="1.0" encoding="UTF-8"?>
389
+ <Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"
390
+ xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes">
391
+ <Slides>{}</Slides>
392
+ <Application>Microsoft Office PowerPoint</Application>
393
+ </Properties>"#,
394
+ slides.len()
395
+ );
396
+ zip.start_file("docProps/app.xml", options).unwrap();
397
+ zip.write_all(app_xml.as_bytes()).unwrap();
398
+
399
+ let _ = zip.finish().unwrap();
400
+ }
401
+ buffer
402
+ }
403
+
404
/// A one-slide deck reports a single slide, exposes its text, and has no
/// images or tables.
#[test]
fn test_extract_pptx_from_bytes_single_slide() {
    let deck = create_test_pptx_bytes(vec!["Hello World"]);
    let extraction = extract_pptx_from_bytes(&deck, false, None).unwrap();

    assert_eq!(extraction.slide_count, 1);
    // Print the extracted text on failure so a mismatch is easy to diagnose.
    assert!(
        extraction.content.contains("Hello World"),
        "Content was: {}",
        extraction.content
    );
    assert_eq!(extraction.image_count, 0);
    assert_eq!(extraction.table_count, 0);
}
418
+
419
/// A three-slide deck reports three slides and the text of each one shows up
/// in the extracted content.
#[test]
fn test_extract_pptx_from_bytes_multiple_slides() {
    let deck = create_test_pptx_bytes(vec!["Slide 1", "Slide 2", "Slide 3"]);
    let extraction = extract_pptx_from_bytes(&deck, false, None).unwrap();

    assert_eq!(extraction.slide_count, 3);
    // Checked in slide order, one assertion per slide text.
    for expected in ["Slide 1", "Slide 2", "Slide 3"] {
        assert!(extraction.content.contains(expected));
    }
}
429
+
430
/// The metadata attached to the extraction result carries the slide count.
#[test]
fn test_extract_pptx_metadata() {
    let deck = create_test_pptx_bytes(vec!["Content"]);
    let extraction = extract_pptx_from_bytes(&deck, false, None).unwrap();
    let metadata = &extraction.metadata;

    // One slide went in, so the metadata must report exactly one slide.
    assert_eq!(metadata.slide_count, 1);
}
438
+
439
/// Slides with no text still count as slides.
#[test]
fn test_extract_pptx_empty_slides() {
    // Three slides, each with an empty text run.
    let deck = create_test_pptx_bytes(vec![""; 3]);
    let extraction = extract_pptx_from_bytes(&deck, false, None).unwrap();

    assert_eq!(extraction.slide_count, 3);
}
446
+
447
/// Bytes that are not a PPTX archive must yield a `Parsing` error whose
/// message names either the archive read or the temp-file write.
#[test]
fn test_extract_pptx_from_bytes_invalid_data() {
    use crate::error::KreuzbergError;

    let garbage = b"not a valid pptx file";
    let outcome = extract_pptx_from_bytes(garbage, false, None);

    assert!(outcome.is_err());
    match outcome {
        Err(KreuzbergError::Parsing { message, .. }) => {
            let is_archive_error = message.contains("Failed to read PPTX archive");
            let is_temp_file_error = message.contains("Failed to write temp PPTX file");
            assert!(is_archive_error || is_temp_file_error);
        }
        // Any other outcome (including a different error variant) is a failure.
        _ => panic!("Expected ParsingError"),
    }
}
461
+
462
/// A zero-length input is rejected with an error.
#[test]
fn test_extract_pptx_from_bytes_empty_data() {
    let no_bytes: &[u8] = b"";
    let outcome = extract_pptx_from_bytes(no_bytes, false, None);

    assert!(outcome.is_err());
}
469
+
470
/// The four-byte header `FF D8 FF E0` is detected as JPEG.
#[test]
fn test_detect_image_format_jpeg() {
    let magic: [u8; 4] = [0xFF, 0xD8, 0xFF, 0xE0];
    assert_eq!(detect_image_format(&magic), "jpeg");
}
475
+
476
/// The four-byte header `89 50 4E 47` is detected as PNG.
#[test]
fn test_detect_image_format_png() {
    let magic: [u8; 4] = [0x89, 0x50, 0x4E, 0x47];
    assert_eq!(detect_image_format(&magic), "png");
}
481
+
482
/// Data starting with the `GIF89a` signature is detected as GIF.
#[test]
fn test_detect_image_format_gif() {
    assert_eq!(detect_image_format(b"GIF89a"), "gif");
}
487
+
488
/// Data consisting of the `BM` signature is detected as BMP.
#[test]
fn test_detect_image_format_bmp() {
    assert_eq!(detect_image_format(b"BM"), "bmp");
}
493
+
494
/// Data that opens with an `<svg …>` root tag is detected as SVG.
#[test]
fn test_detect_image_format_svg() {
    let markup = br#"<svg xmlns="http://www.w3.org/2000/svg">"#;
    assert_eq!(detect_image_format(markup), "svg");
}
499
+
500
/// The header `49 49 2A 00` ("II" byte order) is detected as TIFF.
#[test]
fn test_detect_image_format_tiff_little_endian() {
    let magic: [u8; 4] = [0x49, 0x49, 0x2A, 0x00];
    assert_eq!(detect_image_format(&magic), "tiff");
}
505
+
506
/// The header `4D 4D 00 2A` ("MM" byte order) is detected as TIFF.
#[test]
fn test_detect_image_format_tiff_big_endian() {
    let magic: [u8; 4] = [0x4D, 0x4D, 0x00, 0x2A];
    assert_eq!(detect_image_format(&magic), "tiff");
}
511
+
512
/// Data with no recognised signature is reported as "unknown".
#[test]
fn test_detect_image_format_unknown() {
    assert_eq!(detect_image_format(b"unknown format"), "unknown");
}
517
+
518
/// `html_escape` leaves plain text untouched and rewrites `&`, `<`, `>`,
/// `"` and `'` to their entity forms, alone and in combination.
#[test]
fn test_html_escape() {
    // (input, expected output) pairs, checked in order.
    let cases = [
        ("plain text", "plain text"),
        ("a & b", "a &amp; b"),
        ("<tag>", "&lt;tag&gt;"),
        ("\"quoted\"", "&quot;quoted&quot;"),
        ("'apostrophe'", "&#x27;apostrophe&#x27;"),
        (
            "<a href=\"url\" title='test'>text & more</a>",
            "&lt;a href=&quot;url&quot; title=&#x27;test&#x27;&gt;text &amp; more&lt;/a&gt;",
        ),
    ];

    for (input, expected) in cases {
        assert_eq!(image_handling::html_escape(input), expected);
    }
}
530
+
531
/// The relationships file for a slide lives in a `_rels` directory beside it
/// and is named after the slide with a `.rels` suffix.
#[test]
fn test_get_slide_rels_path() {
    // (slide path, expected relationships path) — single- and double-digit slides.
    let cases = [
        ("ppt/slides/slide1.xml", "ppt/slides/_rels/slide1.xml.rels"),
        ("ppt/slides/slide10.xml", "ppt/slides/_rels/slide10.xml.rels"),
    ];

    for (slide_path, expected) in cases {
        assert_eq!(image_handling::get_slide_rels_path(slide_path), expected);
    }
}
542
+
543
/// A `../`-relative image target is resolved against the slide's directory.
#[test]
fn test_get_full_image_path_relative() {
    let slide_path = "ppt/slides/slide1.xml";
    let resolved = image_handling::get_full_image_path(slide_path, "../media/image1.png");

    assert_eq!(resolved, "ppt/media/image1.png");
}
550
+
551
/// A bare file-name target is placed in the same directory as the slide.
#[test]
fn test_get_full_image_path_direct() {
    let slide_path = "ppt/slides/slide1.xml";
    let resolved = image_handling::get_full_image_path(slide_path, "image1.png");

    assert_eq!(resolved, "ppt/slides/image1.png");
}
558
+ }