kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -29,7 +29,7 @@
29
29
  //! use kreuzberg::extraction::pptx::extract_pptx_from_path;
30
30
  //!
31
31
  //! # fn example() -> kreuzberg::Result<()> {
32
- //! let result = extract_pptx_from_path("presentation.pptx", true, None)?;
32
+ //! let result = extract_pptx_from_path("presentation.pptx", true)?;
33
33
  //!
34
34
  //! println!("Slide count: {}", result.slide_count);
35
35
  //! println!("Image count: {}", result.image_count);
@@ -38,7 +38,6 @@
38
38
  //! # }
39
39
  //! ```
40
40
  use crate::error::{KreuzbergError, Result};
41
- use crate::text::utf8_validation;
42
41
  use crate::types::{ExtractedImage, PptxExtractionResult, PptxMetadata};
43
42
  use std::collections::HashMap;
44
43
  use std::fs::File;
@@ -182,68 +181,18 @@ impl Default for ParserConfig {
182
181
 
183
182
  struct ContentBuilder {
184
183
  content: String,
185
- boundaries: Vec<crate::types::PageBoundary>,
186
- page_contents: Vec<crate::types::PageContent>,
187
- config: Option<crate::core::config::PageConfig>,
188
184
  }
189
185
 
190
186
  impl ContentBuilder {
191
187
  fn new() -> Self {
192
188
  Self {
193
189
  content: String::with_capacity(8192),
194
- boundaries: Vec::new(),
195
- page_contents: Vec::new(),
196
- config: None,
197
190
  }
198
191
  }
199
192
 
200
- fn with_page_config(capacity: usize, config: Option<crate::core::config::PageConfig>) -> Self {
193
+ fn with_capacity(capacity: usize) -> Self {
201
194
  Self {
202
195
  content: String::with_capacity(capacity),
203
- boundaries: if config.is_some() {
204
- Vec::new()
205
- } else {
206
- Vec::with_capacity(0)
207
- },
208
- page_contents: if config.is_some() {
209
- Vec::new()
210
- } else {
211
- Vec::with_capacity(0)
212
- },
213
- config,
214
- }
215
- }
216
-
217
- fn start_slide(&mut self, slide_number: u32) -> usize {
218
- let byte_start = self.content.len();
219
-
220
- if let Some(ref cfg) = self.config
221
- && cfg.insert_page_markers
222
- {
223
- let marker = cfg.marker_format.replace("{page_num}", &slide_number.to_string());
224
- self.content.push_str(&marker);
225
- }
226
-
227
- byte_start
228
- }
229
-
230
- fn end_slide(&mut self, slide_number: u32, byte_start: usize, slide_content: String) {
231
- let byte_end = self.content.len();
232
-
233
- if self.config.is_some() {
234
- self.boundaries.push(crate::types::PageBoundary {
235
- byte_start,
236
- byte_end,
237
- page_number: slide_number as usize,
238
- });
239
-
240
- self.page_contents.push(crate::types::PageContent {
241
- page_number: slide_number as usize,
242
- content: slide_content,
243
- tables: Vec::new(),
244
- images: Vec::new(),
245
- hierarchy: None,
246
- });
247
196
  }
248
197
  }
249
198
 
@@ -322,25 +271,8 @@ impl ContentBuilder {
322
271
  }
323
272
  }
324
273
 
325
- fn build(
326
- self,
327
- ) -> (
328
- String,
329
- Option<Vec<crate::types::PageBoundary>>,
330
- Option<Vec<crate::types::PageContent>>,
331
- ) {
332
- let content = self.content.trim().to_string();
333
- let boundaries = if self.config.is_some() && !self.boundaries.is_empty() {
334
- Some(self.boundaries)
335
- } else {
336
- None
337
- };
338
- let pages = if self.config.is_some() && !self.page_contents.is_empty() {
339
- Some(self.page_contents)
340
- } else {
341
- None
342
- };
343
- (content, boundaries, pages)
274
+ fn build(self) -> String {
275
+ self.content.trim().to_string()
344
276
  }
345
277
  }
346
278
 
@@ -511,7 +443,7 @@ impl Slide {
511
443
  }
512
444
  }
513
445
 
514
- builder.build().0
446
+ builder.build()
515
447
  }
516
448
 
517
449
  fn image_count(&self) -> usize {
@@ -593,8 +525,8 @@ enum ParsedContent {
593
525
  }
594
526
 
595
527
  fn parse_slide_xml(xml_data: &[u8]) -> Result<Vec<SlideElement>> {
596
- let xml_str = utf8_validation::from_utf8(xml_data)
597
- .map_err(|_| KreuzbergError::parsing("Invalid UTF-8 in slide XML".to_string()))?;
528
+ let xml_str =
529
+ std::str::from_utf8(xml_data).map_err(|_| KreuzbergError::parsing("Invalid UTF-8 in slide XML".to_string()))?;
598
530
 
599
531
  let doc =
600
532
  Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse slide XML: {}", e)))?;
@@ -900,7 +832,7 @@ fn extract_position(node: &Node) -> ElementPosition {
900
832
  }
901
833
 
902
834
  fn parse_slide_rels(rels_data: &[u8]) -> Result<Vec<ImageReference>> {
903
- let xml_str = utf8_validation::from_utf8(rels_data)
835
+ let xml_str = std::str::from_utf8(rels_data)
904
836
  .map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in rels XML: {}", e)))?;
905
837
 
906
838
  let doc =
@@ -925,7 +857,7 @@ fn parse_slide_rels(rels_data: &[u8]) -> Result<Vec<ImageReference>> {
925
857
  }
926
858
 
927
859
  fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
928
- let xml_str = utf8_validation::from_utf8(rels_data)
860
+ let xml_str = std::str::from_utf8(rels_data)
929
861
  .map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in presentation rels: {}", e)))?;
930
862
 
931
863
  let doc = Document::parse(xml_str)
@@ -1034,12 +966,24 @@ fn extract_metadata(archive: &mut ZipArchive<File>) -> PptxMetadata {
1034
966
  }
1035
967
  }
1036
968
 
1037
- PptxMetadata { fonts: Vec::new() }
969
+ PptxMetadata {
970
+ title: metadata_map.get("title").cloned(),
971
+ author: metadata_map.get("author").cloned(),
972
+ description: metadata_map.get("description").cloned(),
973
+ summary: metadata_map.get("summary").cloned(),
974
+ fonts: Vec::new(),
975
+ }
1038
976
  }
1039
977
 
1040
978
  #[cfg(not(feature = "office"))]
1041
979
  {
1042
- PptxMetadata { fonts: Vec::new() }
980
+ PptxMetadata {
981
+ title: None,
982
+ author: None,
983
+ description: None,
984
+ summary: None,
985
+ fonts: Vec::new(),
986
+ }
1043
987
  }
1044
988
  }
1045
989
 
@@ -1061,13 +1005,13 @@ fn extract_all_notes(container: &mut PptxContainer) -> Result<HashMap<u32, Strin
1061
1005
  }
1062
1006
 
1063
1007
  fn extract_notes_text(notes_xml: &[u8]) -> Result<String> {
1064
- let xml_str = utf8_validation::from_utf8(notes_xml)
1008
+ let xml_str = std::str::from_utf8(notes_xml)
1065
1009
  .map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in notes XML: {}", e)))?;
1066
1010
 
1067
1011
  let doc =
1068
1012
  Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse notes XML: {}", e)))?;
1069
1013
 
1070
- let mut text_parts = Vec::with_capacity(16);
1014
+ let mut text_parts = Vec::new();
1071
1015
  const DRAWINGML_NS: &str = "http://schemas.openxmlformats.org/drawingml/2006/main";
1072
1016
 
1073
1017
  for node in doc.descendants() {
@@ -1126,11 +1070,7 @@ fn detect_image_format(data: &[u8]) -> String {
1126
1070
  }
1127
1071
  }
1128
1072
 
1129
- pub fn extract_pptx_from_path(
1130
- path: &str,
1131
- extract_images: bool,
1132
- page_config: Option<&crate::core::config::PageConfig>,
1133
- ) -> Result<PptxExtractionResult> {
1073
+ pub fn extract_pptx_from_path(path: &str, extract_images: bool) -> Result<PptxExtractionResult> {
1134
1074
  let config = ParserConfig {
1135
1075
  extract_images,
1136
1076
  ..Default::default()
@@ -1145,19 +1085,15 @@ pub fn extract_pptx_from_path(
1145
1085
  let mut iterator = SlideIterator::new(container);
1146
1086
  let slide_count = iterator.slide_count();
1147
1087
 
1148
- let estimated_capacity = slide_count.saturating_mul(1000).max(8192);
1149
- let mut content_builder = ContentBuilder::with_page_config(estimated_capacity, page_config.cloned());
1088
+ let estimated_capacity = slide_count * 1024;
1089
+ let mut content_builder = ContentBuilder::with_capacity(estimated_capacity);
1150
1090
 
1151
1091
  let mut total_image_count = 0;
1152
1092
  let mut total_table_count = 0;
1153
1093
  let mut extracted_images = Vec::new();
1154
1094
 
1155
1095
  while let Some(slide) = iterator.next_slide()? {
1156
- let byte_start = if page_config.is_some() {
1157
- content_builder.start_slide(slide.slide_number)
1158
- } else {
1159
- 0
1160
- };
1096
+ content_builder.add_slide_header(slide.slide_number);
1161
1097
 
1162
1098
  let slide_content = slide.to_markdown(&config);
1163
1099
  content_builder.add_text(&slide_content);
@@ -1166,10 +1102,6 @@ pub fn extract_pptx_from_path(
1166
1102
  content_builder.add_notes(slide_notes);
1167
1103
  }
1168
1104
 
1169
- if page_config.is_some() {
1170
- content_builder.end_slide(slide.slide_number, byte_start, slide_content.clone());
1171
- }
1172
-
1173
1105
  if config.extract_images
1174
1106
  && let Ok(image_data) = iterator.get_slide_images(&slide)
1175
1107
  {
@@ -1197,43 +1129,17 @@ pub fn extract_pptx_from_path(
1197
1129
  total_table_count += slide.table_count();
1198
1130
  }
1199
1131
 
1200
- let (content, boundaries, page_contents) = content_builder.build();
1201
-
1202
- let page_structure = boundaries.as_ref().map(|bounds| crate::types::PageStructure {
1203
- total_count: slide_count,
1204
- unit_type: crate::types::PageUnitType::Slide,
1205
- boundaries: Some(bounds.clone()),
1206
- pages: page_contents.as_ref().map(|pcs| {
1207
- pcs.iter()
1208
- .map(|pc| crate::types::PageInfo {
1209
- number: pc.page_number,
1210
- title: None,
1211
- dimensions: None,
1212
- image_count: None,
1213
- table_count: None,
1214
- hidden: None,
1215
- })
1216
- .collect()
1217
- }),
1218
- });
1219
-
1220
1132
  Ok(PptxExtractionResult {
1221
- content,
1133
+ content: content_builder.build(),
1222
1134
  metadata,
1223
1135
  slide_count,
1224
1136
  image_count: total_image_count,
1225
1137
  table_count: total_table_count,
1226
1138
  images: extracted_images,
1227
- page_structure,
1228
- page_contents,
1229
1139
  })
1230
1140
  }
1231
1141
 
1232
- pub fn extract_pptx_from_bytes(
1233
- data: &[u8],
1234
- extract_images: bool,
1235
- page_config: Option<&crate::core::config::PageConfig>,
1236
- ) -> Result<PptxExtractionResult> {
1142
+ pub fn extract_pptx_from_bytes(data: &[u8], extract_images: bool) -> Result<PptxExtractionResult> {
1237
1143
  use std::sync::atomic::{AtomicU64, Ordering};
1238
1144
  static COUNTER: AtomicU64 = AtomicU64::new(0);
1239
1145
  let unique_id = COUNTER.fetch_add(1, Ordering::SeqCst);
@@ -1242,17 +1148,9 @@ pub fn extract_pptx_from_bytes(
1242
1148
  // IO errors must bubble up - temp file write issues need user reports ~keep
1243
1149
  std::fs::write(&temp_path, data)?;
1244
1150
 
1245
- let result = extract_pptx_from_path(
1246
- temp_path.to_str().ok_or_else(|| {
1247
- crate::KreuzbergError::validation("Invalid temp path - contains invalid UTF-8".to_string())
1248
- })?,
1249
- extract_images,
1250
- page_config,
1251
- );
1151
+ let result = extract_pptx_from_path(temp_path.to_str().unwrap(), extract_images);
1252
1152
 
1253
- if let Err(e) = std::fs::remove_file(&temp_path) {
1254
- tracing::warn!("Failed to remove temp PPTX file: {}", e);
1255
- }
1153
+ let _ = std::fs::remove_file(&temp_path);
1256
1154
 
1257
1155
  result
1258
1156
  }
@@ -1352,7 +1250,7 @@ mod tests {
1352
1250
  #[test]
1353
1251
  fn test_extract_pptx_from_bytes_single_slide() {
1354
1252
  let pptx_bytes = create_test_pptx_bytes(vec!["Hello World"]);
1355
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1253
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1356
1254
 
1357
1255
  assert_eq!(result.slide_count, 1);
1358
1256
  assert!(
@@ -1367,7 +1265,7 @@ mod tests {
1367
1265
  #[test]
1368
1266
  fn test_extract_pptx_from_bytes_multiple_slides() {
1369
1267
  let pptx_bytes = create_test_pptx_bytes(vec!["Slide 1", "Slide 2", "Slide 3"]);
1370
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1268
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1371
1269
 
1372
1270
  assert_eq!(result.slide_count, 3);
1373
1271
  assert!(result.content.contains("Slide 1"));
@@ -1378,15 +1276,18 @@ mod tests {
1378
1276
  #[test]
1379
1277
  fn test_extract_pptx_metadata() {
1380
1278
  let pptx_bytes = create_test_pptx_bytes(vec!["Content"]);
1381
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1279
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1382
1280
 
1383
- assert!(result.metadata.fonts.is_empty() || !result.metadata.fonts.is_empty());
1281
+ assert_eq!(result.metadata.title, Some("Test Presentation".to_string()));
1282
+ assert_eq!(result.metadata.author, Some("Test Author".to_string()));
1283
+ assert_eq!(result.metadata.description, Some("Test Description".to_string()));
1284
+ assert_eq!(result.metadata.summary, Some("Test Subject".to_string()));
1384
1285
  }
1385
1286
 
1386
1287
  #[test]
1387
1288
  fn test_extract_pptx_empty_slides() {
1388
1289
  let pptx_bytes = create_test_pptx_bytes(vec!["", "", ""]);
1389
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1290
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1390
1291
 
1391
1292
  assert_eq!(result.slide_count, 3);
1392
1293
  }
@@ -1394,7 +1295,7 @@ mod tests {
1394
1295
  #[test]
1395
1296
  fn test_extract_pptx_from_bytes_invalid_data() {
1396
1297
  let invalid_bytes = b"not a valid pptx file";
1397
- let result = extract_pptx_from_bytes(invalid_bytes, false, None);
1298
+ let result = extract_pptx_from_bytes(invalid_bytes, false);
1398
1299
 
1399
1300
  assert!(result.is_err());
1400
1301
  if let Err(KreuzbergError::Parsing { message: msg, .. }) = result {
@@ -1407,7 +1308,7 @@ mod tests {
1407
1308
  #[test]
1408
1309
  fn test_extract_pptx_from_bytes_empty_data() {
1409
1310
  let empty_bytes: &[u8] = &[];
1410
- let result = extract_pptx_from_bytes(empty_bytes, false, None);
1311
+ let result = extract_pptx_from_bytes(empty_bytes, false);
1411
1312
 
1412
1313
  assert!(result.is_err());
1413
1314
  }
@@ -1507,8 +1408,7 @@ mod tests {
1507
1408
  builder.add_text("Hello");
1508
1409
  builder.add_text(" ");
1509
1410
  builder.add_text("World");
1510
- let (content, _, _) = builder.build();
1511
- assert_eq!(content, "HelloWorld");
1411
+ assert_eq!(builder.build(), "HelloWorld");
1512
1412
  }
1513
1413
 
1514
1414
  #[test]
@@ -1516,32 +1416,28 @@ mod tests {
1516
1416
  let mut builder = ContentBuilder::new();
1517
1417
  builder.add_text(" ");
1518
1418
  builder.add_text("");
1519
- let (content, _, _) = builder.build();
1520
- assert_eq!(content, "");
1419
+ assert_eq!(builder.build(), "");
1521
1420
  }
1522
1421
 
1523
1422
  #[test]
1524
1423
  fn test_content_builder_add_title() {
1525
1424
  let mut builder = ContentBuilder::new();
1526
1425
  builder.add_title("Title");
1527
- let (content, _, _) = builder.build();
1528
- assert_eq!(content, "# Title");
1426
+ assert_eq!(builder.build(), "# Title");
1529
1427
  }
1530
1428
 
1531
1429
  #[test]
1532
1430
  fn test_content_builder_add_title_with_whitespace() {
1533
1431
  let mut builder = ContentBuilder::new();
1534
1432
  builder.add_title(" Title ");
1535
- let (content, _, _) = builder.build();
1536
- assert_eq!(content, "# Title");
1433
+ assert_eq!(builder.build(), "# Title");
1537
1434
  }
1538
1435
 
1539
1436
  #[test]
1540
1437
  fn test_content_builder_add_table_empty() {
1541
1438
  let mut builder = ContentBuilder::new();
1542
1439
  builder.add_table(&[]);
1543
- let (content, _, _) = builder.build();
1544
- assert_eq!(content, "");
1440
+ assert_eq!(builder.build(), "");
1545
1441
  }
1546
1442
 
1547
1443
  #[test]
@@ -1550,9 +1446,9 @@ mod tests {
1550
1446
  let rows = vec![vec!["Header1".to_string(), "Header2".to_string()]];
1551
1447
  builder.add_table(&rows);
1552
1448
  let result = builder.build();
1553
- assert!(result.0.contains("<table>"));
1554
- assert!(result.0.contains("<th>Header1</th>"));
1555
- assert!(result.0.contains("<th>Header2</th>"));
1449
+ assert!(result.contains("<table>"));
1450
+ assert!(result.contains("<th>Header1</th>"));
1451
+ assert!(result.contains("<th>Header2</th>"));
1556
1452
  }
1557
1453
 
1558
1454
  #[test]
@@ -1564,8 +1460,8 @@ mod tests {
1564
1460
  ];
1565
1461
  builder.add_table(&rows);
1566
1462
  let result = builder.build();
1567
- assert!(result.0.contains("<th>H1</th>"));
1568
- assert!(result.0.contains("<td>D1</td>"));
1463
+ assert!(result.contains("<th>H1</th>"));
1464
+ assert!(result.contains("<td>D1</td>"));
1569
1465
  }
1570
1466
 
1571
1467
  #[test]
@@ -1574,8 +1470,8 @@ mod tests {
1574
1470
  let rows = vec![vec!["<tag>".to_string(), "a & b".to_string()]];
1575
1471
  builder.add_table(&rows);
1576
1472
  let result = builder.build();
1577
- assert!(result.0.contains("&lt;tag&gt;"));
1578
- assert!(result.0.contains("a &amp; b"));
1473
+ assert!(result.contains("&lt;tag&gt;"));
1474
+ assert!(result.contains("a &amp; b"));
1579
1475
  }
1580
1476
 
1581
1477
  #[test]
@@ -1584,8 +1480,8 @@ mod tests {
1584
1480
  builder.add_list_item(1, false, "Item 1");
1585
1481
  builder.add_list_item(1, false, "Item 2");
1586
1482
  let result = builder.build();
1587
- assert!(result.0.contains("- Item 1"));
1588
- assert!(result.0.contains("- Item 2"));
1483
+ assert!(result.contains("- Item 1"));
1484
+ assert!(result.contains("- Item 2"));
1589
1485
  }
1590
1486
 
1591
1487
  #[test]
@@ -1594,8 +1490,8 @@ mod tests {
1594
1490
  builder.add_list_item(1, true, "First");
1595
1491
  builder.add_list_item(1, true, "Second");
1596
1492
  let result = builder.build();
1597
- assert!(result.0.contains("1. First"));
1598
- assert!(result.0.contains("1. Second"));
1493
+ assert!(result.contains("1. First"));
1494
+ assert!(result.contains("1. Second"));
1599
1495
  }
1600
1496
 
1601
1497
  #[test]
@@ -1605,9 +1501,9 @@ mod tests {
1605
1501
  builder.add_list_item(2, false, "Level 2");
1606
1502
  builder.add_list_item(3, false, "Level 3");
1607
1503
  let result = builder.build();
1608
- assert!(result.0.contains("- Level 1"));
1609
- assert!(result.0.contains(" - Level 2"));
1610
- assert!(result.0.contains(" - Level 3"));
1504
+ assert!(result.contains("- Level 1"));
1505
+ assert!(result.contains(" - Level 2"));
1506
+ assert!(result.contains(" - Level 3"));
1611
1507
  }
1612
1508
 
1613
1509
  #[test]
@@ -1615,7 +1511,7 @@ mod tests {
1615
1511
  let mut builder = ContentBuilder::new();
1616
1512
  builder.add_image("img123", 5);
1617
1513
  let result = builder.build();
1618
- assert!(result.0.contains("![img123](slide_5_image_img123.jpg)"));
1514
+ assert!(result.contains("![img123](slide_5_image_img123.jpg)"));
1619
1515
  }
1620
1516
 
1621
1517
  #[test]
@@ -1623,16 +1519,15 @@ mod tests {
1623
1519
  let mut builder = ContentBuilder::new();
1624
1520
  builder.add_notes("This is a note");
1625
1521
  let result = builder.build();
1626
- assert!(result.0.contains("### Notes:"));
1627
- assert!(result.0.contains("This is a note"));
1522
+ assert!(result.contains("### Notes:"));
1523
+ assert!(result.contains("This is a note"));
1628
1524
  }
1629
1525
 
1630
1526
  #[test]
1631
1527
  fn test_content_builder_add_notes_empty() {
1632
1528
  let mut builder = ContentBuilder::new();
1633
1529
  builder.add_notes(" ");
1634
- let (content, _, _) = builder.build();
1635
- assert_eq!(content, "");
1530
+ assert_eq!(builder.build(), "");
1636
1531
  }
1637
1532
 
1638
1533
  #[test]
@@ -1640,7 +1535,7 @@ mod tests {
1640
1535
  let mut builder = ContentBuilder::new();
1641
1536
  builder.add_slide_header(3);
1642
1537
  let result = builder.build();
1643
- assert!(result.0.contains("<!-- Slide number: 3 -->"));
1538
+ assert!(result.contains("<!-- Slide number: 3 -->"));
1644
1539
  }
1645
1540
 
1646
1541
  #[test]
@@ -2308,7 +2203,7 @@ mod tests {
2308
2203
  vec!["Row 2 Col 1", "Row 2 Col 2", "Row 2 Col 3"],
2309
2204
  ]);
2310
2205
 
2311
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2206
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2312
2207
 
2313
2208
  assert_eq!(result.table_count, 1, "Should detect one table");
2314
2209
  assert!(result.content.contains("<table>"), "Should contain table tag");
@@ -2340,7 +2235,7 @@ mod tests {
2340
2235
  vec!["A4", "B4", "C4", "D4"],
2341
2236
  ]);
2342
2237
 
2343
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2238
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2344
2239
 
2345
2240
  assert_eq!(result.table_count, 1, "Should detect one table");
2346
2241
  assert!(result.content.contains("<tr>"), "Should contain table rows");
@@ -2355,7 +2250,7 @@ mod tests {
2355
2250
  fn test_table_counting_via_slide_metadata_succeeds() {
2356
2251
  let pptx_bytes = create_pptx_with_table(vec![vec!["Col1", "Col2"], vec!["Val1", "Val2"]]);
2357
2252
 
2358
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2253
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2359
2254
 
2360
2255
  assert_eq!(result.table_count, 1, "table_count should be 1");
2361
2256
  }
@@ -2367,7 +2262,7 @@ mod tests {
2367
2262
  vec!["Cell data 1", "Cell data 2"],
2368
2263
  ]);
2369
2264
 
2370
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2265
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2371
2266
 
2372
2267
  assert!(result.content.contains("<table>"), "Should contain table tag");
2373
2268
  assert!(
@@ -2383,7 +2278,7 @@ mod tests {
2383
2278
  #[test]
2384
2279
  fn test_table_extraction_empty_table_returns_one_count() {
2385
2280
  let pptx_bytes = create_pptx_with_table(vec![]);
2386
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2281
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2387
2282
 
2388
2283
  assert_eq!(result.table_count, 1, "Empty table structure should be detected");
2389
2284
  assert!(!result.content.contains("<td>"), "Empty table should have no cells");
@@ -2397,7 +2292,7 @@ mod tests {
2397
2292
  (1, true, "Third item"),
2398
2293
  ]);
2399
2294
 
2400
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2295
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2401
2296
 
2402
2297
  assert!(
2403
2298
  result.content.contains("1. First item"),
@@ -2421,7 +2316,7 @@ mod tests {
2421
2316
  (1, false, "Bullet three"),
2422
2317
  ]);
2423
2318
 
2424
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2319
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2425
2320
 
2426
2321
  assert!(result.content.contains("- Bullet one"), "Should contain bullet point 1");
2427
2322
  assert!(result.content.contains("- Bullet two"), "Should contain bullet point 2");
@@ -2441,7 +2336,7 @@ mod tests {
2441
2336
  (1, false, "Back to Level 1"),
2442
2337
  ]);
2443
2338
 
2444
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2339
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2445
2340
 
2446
2341
  assert!(
2447
2342
  result.content.contains("- Level 1 Item"),
@@ -2470,7 +2365,7 @@ mod tests {
2470
2365
  (1, true, "Ordered item 2"),
2471
2366
  ]);
2472
2367
 
2473
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2368
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2474
2369
 
2475
2370
  assert!(
2476
2371
  result.content.contains("1. Ordered item 1"),
@@ -2489,7 +2384,7 @@ mod tests {
2489
2384
  #[test]
2490
2385
  fn test_image_extraction_from_slide_xml_succeeds() {
2491
2386
  let pptx_bytes = create_pptx_with_images();
2492
- let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2387
+ let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
2493
2388
 
2494
2389
  assert_eq!(result.image_count, 2, "Should detect 2 images");
2495
2390
  assert!(!result.images.is_empty(), "Should extract image data");
@@ -2498,7 +2393,7 @@ mod tests {
2498
2393
  #[test]
2499
2394
  fn test_image_data_loading_from_zip_archive_succeeds() {
2500
2395
  let pptx_bytes = create_pptx_with_images();
2501
- let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2396
+ let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
2502
2397
 
2503
2398
  assert_eq!(result.images.len(), 2, "Should load 2 images");
2504
2399
 
@@ -2510,7 +2405,7 @@ mod tests {
2510
2405
  #[test]
2511
2406
  fn test_image_format_detection_succeeds() {
2512
2407
  let pptx_bytes = create_pptx_with_images();
2513
- let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2408
+ let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
2514
2409
 
2515
2410
  assert_eq!(result.images.len(), 2, "Should have 2 images");
2516
2411
 
@@ -2523,7 +2418,7 @@ mod tests {
2523
2418
  #[test]
2524
2419
  fn test_image_counting_via_result_metadata_succeeds() {
2525
2420
  let pptx_bytes = create_pptx_with_images();
2526
- let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2421
+ let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
2527
2422
 
2528
2423
  assert_eq!(result.image_count, 2, "image_count should match actual images");
2529
2424
  assert_eq!(result.images.len(), 2, "images vector should have 2 elements");
@@ -2532,7 +2427,7 @@ mod tests {
2532
2427
  #[test]
2533
2428
  fn test_image_extraction_disabled_returns_zero_images() {
2534
2429
  let pptx_bytes = create_pptx_with_images();
2535
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2430
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2536
2431
 
2537
2432
  assert_eq!(
2538
2433
  result.image_count, 2,
@@ -2544,7 +2439,7 @@ mod tests {
2544
2439
  #[test]
2545
2440
  fn test_multiple_images_per_slide_extraction_succeeds() {
2546
2441
  let pptx_bytes = create_pptx_with_images();
2547
- let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2442
+ let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
2548
2443
 
2549
2444
  assert_eq!(result.slide_count, 1, "Should have 1 slide");
2550
2445
  assert_eq!(result.image_count, 2, "Single slide should contain 2 images");
@@ -2557,7 +2452,7 @@ mod tests {
2557
2452
  #[test]
2558
2453
  fn test_formatting_bold_text_renders_as_markdown_bold() {
2559
2454
  let pptx_bytes = create_pptx_with_formatting();
2560
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2455
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2561
2456
 
2562
2457
  assert!(
2563
2458
  result.content.contains("**Bold text"),
@@ -2568,7 +2463,7 @@ mod tests {
2568
2463
  #[test]
2569
2464
  fn test_formatting_italic_text_renders_as_markdown_italic() {
2570
2465
  let pptx_bytes = create_pptx_with_formatting();
2571
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2466
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2572
2467
 
2573
2468
  assert!(
2574
2469
  result.content.contains("*Italic text"),
@@ -2579,7 +2474,7 @@ mod tests {
2579
2474
  #[test]
2580
2475
  fn test_formatting_underline_text_renders_as_html_underline() {
2581
2476
  let pptx_bytes = create_pptx_with_formatting();
2582
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2477
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2583
2478
 
2584
2479
  assert!(
2585
2480
  result.content.contains("<u>Underline text"),
@@ -2590,7 +2485,7 @@ mod tests {
2590
2485
  #[test]
2591
2486
  fn test_formatting_combined_bold_italic_renders_correctly() {
2592
2487
  let pptx_bytes = create_pptx_with_formatting();
2593
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2488
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2594
2489
 
2595
2490
  assert!(
2596
2491
  result.content.contains("***Bold italic text"),
@@ -2816,7 +2711,7 @@ mod tests {
2816
2711
  let _ = zip.finish().unwrap();
2817
2712
  }
2818
2713
 
2819
- let result = extract_pptx_from_bytes(&buffer, true, None).unwrap();
2714
+ let result = extract_pptx_from_bytes(&buffer, true).unwrap();
2820
2715
 
2821
2716
  assert!(
2822
2717
  result.content.contains("**Title with Bold"),
@@ -2955,7 +2850,7 @@ mod tests {
2955
2850
  let _ = zip.finish().unwrap();
2956
2851
  }
2957
2852
 
2958
- let result = extract_pptx_from_bytes(&buffer, false, None).unwrap();
2853
+ let result = extract_pptx_from_bytes(&buffer, false).unwrap();
2959
2854
 
2960
2855
  let content = result.content;
2961
2856
  let top_left_pos = content.find("Top Left").unwrap();
@@ -3082,7 +2977,7 @@ mod tests {
3082
2977
  let _ = zip.finish().unwrap();
3083
2978
  }
3084
2979
 
3085
- let result = extract_pptx_from_bytes(&buffer, false, None).unwrap();
2980
+ let result = extract_pptx_from_bytes(&buffer, false).unwrap();
3086
2981
 
3087
2982
  assert!(result.content.contains("Slide Content"), "Should contain slide content");
3088
2983
  assert!(result.content.contains("### Notes:"), "Should contain notes header");
@@ -3095,8 +2990,11 @@ mod tests {
3095
2990
  #[test]
3096
2991
  fn test_integration_metadata_extraction_complete() {
3097
2992
  let pptx_bytes = create_test_pptx_bytes(vec!["Content"]);
3098
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2993
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
3099
2994
 
3100
- let _ = &result.metadata.fonts;
2995
+ assert_eq!(result.metadata.title, Some("Test Presentation".to_string()));
2996
+ assert_eq!(result.metadata.author, Some("Test Author".to_string()));
2997
+ assert_eq!(result.metadata.description, Some("Test Description".to_string()));
2998
+ assert_eq!(result.metadata.summary, Some("Test Subject".to_string()));
3101
2999
  }
3102
3000
  }