kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -5
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +32 -11
  14. data/vendor/kreuzberg/README.md +54 -8
  15. data/vendor/kreuzberg/build.rs +549 -132
  16. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  17. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  18. data/vendor/kreuzberg/src/core/config.rs +49 -1
  19. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  20. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  22. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  23. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  24. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  25. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  26. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  27. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  28. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  31. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  32. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  33. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  35. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  37. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  38. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  39. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  43. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  44. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
  47. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  48. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  50. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  51. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  52. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  53. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  54. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  55. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  56. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  57. data/vendor/kreuzberg/src/lib.rs +10 -2
  58. data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
  59. data/vendor/kreuzberg/src/mcp/server.rs +120 -12
  60. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  61. data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  95. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  96. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  97. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  98. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  99. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  100. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  101. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  102. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  103. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  104. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  105. data/vendor/rb-sys/Cargo.lock +15 -15
  106. data/vendor/rb-sys/Cargo.toml +4 -4
  107. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/lib.rs +1 -0
  113. data/vendor/rb-sys/src/macros.rs +2 -2
  114. data/vendor/rb-sys/src/special_consts.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  116. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  120. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  121. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  122. data/vendor/rb-sys/src/stable_api.rs +0 -1
  123. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  124. metadata +13 -10
  125. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  126. data/vendor/rb-sys/.cargo-ok +0 -1
  127. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -29,7 +29,7 @@
29
29
  //! use kreuzberg::extraction::pptx::extract_pptx_from_path;
30
30
  //!
31
31
  //! # fn example() -> kreuzberg::Result<()> {
32
- //! let result = extract_pptx_from_path("presentation.pptx", true)?;
32
+ //! let result = extract_pptx_from_path("presentation.pptx", true, None)?;
33
33
  //!
34
34
  //! println!("Slide count: {}", result.slide_count);
35
35
  //! println!("Image count: {}", result.image_count);
@@ -181,18 +181,67 @@ impl Default for ParserConfig {
181
181
 
182
182
  struct ContentBuilder {
183
183
  content: String,
184
+ boundaries: Vec<crate::types::PageBoundary>,
185
+ page_contents: Vec<crate::types::PageContent>,
186
+ config: Option<crate::core::config::PageConfig>,
184
187
  }
185
188
 
186
189
  impl ContentBuilder {
187
190
  fn new() -> Self {
188
191
  Self {
189
192
  content: String::with_capacity(8192),
193
+ boundaries: Vec::new(),
194
+ page_contents: Vec::new(),
195
+ config: None,
190
196
  }
191
197
  }
192
198
 
193
- fn with_capacity(capacity: usize) -> Self {
199
+ fn with_page_config(capacity: usize, config: Option<crate::core::config::PageConfig>) -> Self {
194
200
  Self {
195
201
  content: String::with_capacity(capacity),
202
+ boundaries: if config.is_some() {
203
+ Vec::new()
204
+ } else {
205
+ Vec::with_capacity(0)
206
+ },
207
+ page_contents: if config.is_some() {
208
+ Vec::new()
209
+ } else {
210
+ Vec::with_capacity(0)
211
+ },
212
+ config,
213
+ }
214
+ }
215
+
216
+ fn start_slide(&mut self, slide_number: u32) -> usize {
217
+ let byte_start = self.content.len();
218
+
219
+ if let Some(ref cfg) = self.config
220
+ && cfg.insert_page_markers
221
+ {
222
+ let marker = cfg.marker_format.replace("{page_num}", &slide_number.to_string());
223
+ self.content.push_str(&marker);
224
+ }
225
+
226
+ byte_start
227
+ }
228
+
229
+ fn end_slide(&mut self, slide_number: u32, byte_start: usize, slide_content: String) {
230
+ let byte_end = self.content.len();
231
+
232
+ if self.config.is_some() {
233
+ self.boundaries.push(crate::types::PageBoundary {
234
+ byte_start,
235
+ byte_end,
236
+ page_number: slide_number as usize,
237
+ });
238
+
239
+ self.page_contents.push(crate::types::PageContent {
240
+ page_number: slide_number as usize,
241
+ content: slide_content,
242
+ tables: Vec::new(),
243
+ images: Vec::new(),
244
+ });
196
245
  }
197
246
  }
198
247
 
@@ -271,8 +320,25 @@ impl ContentBuilder {
271
320
  }
272
321
  }
273
322
 
274
- fn build(self) -> String {
275
- self.content.trim().to_string()
323
+ fn build(
324
+ self,
325
+ ) -> (
326
+ String,
327
+ Option<Vec<crate::types::PageBoundary>>,
328
+ Option<Vec<crate::types::PageContent>>,
329
+ ) {
330
+ let content = self.content.trim().to_string();
331
+ let boundaries = if self.config.is_some() && !self.boundaries.is_empty() {
332
+ Some(self.boundaries)
333
+ } else {
334
+ None
335
+ };
336
+ let pages = if self.config.is_some() && !self.page_contents.is_empty() {
337
+ Some(self.page_contents)
338
+ } else {
339
+ None
340
+ };
341
+ (content, boundaries, pages)
276
342
  }
277
343
  }
278
344
 
@@ -443,7 +509,7 @@ impl Slide {
443
509
  }
444
510
  }
445
511
 
446
- builder.build()
512
+ builder.build().0
447
513
  }
448
514
 
449
515
  fn image_count(&self) -> usize {
@@ -966,24 +1032,12 @@ fn extract_metadata(archive: &mut ZipArchive<File>) -> PptxMetadata {
966
1032
  }
967
1033
  }
968
1034
 
969
- PptxMetadata {
970
- title: metadata_map.get("title").cloned(),
971
- author: metadata_map.get("author").cloned(),
972
- description: metadata_map.get("description").cloned(),
973
- summary: metadata_map.get("summary").cloned(),
974
- fonts: Vec::new(),
975
- }
1035
+ PptxMetadata { fonts: Vec::new() }
976
1036
  }
977
1037
 
978
1038
  #[cfg(not(feature = "office"))]
979
1039
  {
980
- PptxMetadata {
981
- title: None,
982
- author: None,
983
- description: None,
984
- summary: None,
985
- fonts: Vec::new(),
986
- }
1040
+ PptxMetadata { fonts: Vec::new() }
987
1041
  }
988
1042
  }
989
1043
 
@@ -1070,7 +1124,11 @@ fn detect_image_format(data: &[u8]) -> String {
1070
1124
  }
1071
1125
  }
1072
1126
 
1073
- pub fn extract_pptx_from_path(path: &str, extract_images: bool) -> Result<PptxExtractionResult> {
1127
+ pub fn extract_pptx_from_path(
1128
+ path: &str,
1129
+ extract_images: bool,
1130
+ page_config: Option<&crate::core::config::PageConfig>,
1131
+ ) -> Result<PptxExtractionResult> {
1074
1132
  let config = ParserConfig {
1075
1133
  extract_images,
1076
1134
  ..Default::default()
@@ -1086,14 +1144,18 @@ pub fn extract_pptx_from_path(path: &str, extract_images: bool) -> Result<PptxEx
1086
1144
  let slide_count = iterator.slide_count();
1087
1145
 
1088
1146
  let estimated_capacity = slide_count * 1024;
1089
- let mut content_builder = ContentBuilder::with_capacity(estimated_capacity);
1147
+ let mut content_builder = ContentBuilder::with_page_config(estimated_capacity, page_config.cloned());
1090
1148
 
1091
1149
  let mut total_image_count = 0;
1092
1150
  let mut total_table_count = 0;
1093
1151
  let mut extracted_images = Vec::new();
1094
1152
 
1095
1153
  while let Some(slide) = iterator.next_slide()? {
1096
- content_builder.add_slide_header(slide.slide_number);
1154
+ let byte_start = if page_config.is_some() {
1155
+ content_builder.start_slide(slide.slide_number)
1156
+ } else {
1157
+ 0
1158
+ };
1097
1159
 
1098
1160
  let slide_content = slide.to_markdown(&config);
1099
1161
  content_builder.add_text(&slide_content);
@@ -1102,6 +1164,10 @@ pub fn extract_pptx_from_path(path: &str, extract_images: bool) -> Result<PptxEx
1102
1164
  content_builder.add_notes(slide_notes);
1103
1165
  }
1104
1166
 
1167
+ if page_config.is_some() {
1168
+ content_builder.end_slide(slide.slide_number, byte_start, slide_content.clone());
1169
+ }
1170
+
1105
1171
  if config.extract_images
1106
1172
  && let Ok(image_data) = iterator.get_slide_images(&slide)
1107
1173
  {
@@ -1129,17 +1195,43 @@ pub fn extract_pptx_from_path(path: &str, extract_images: bool) -> Result<PptxEx
1129
1195
  total_table_count += slide.table_count();
1130
1196
  }
1131
1197
 
1198
+ let (content, boundaries, page_contents) = content_builder.build();
1199
+
1200
+ let page_structure = boundaries.as_ref().map(|bounds| crate::types::PageStructure {
1201
+ total_count: slide_count,
1202
+ unit_type: crate::types::PageUnitType::Slide,
1203
+ boundaries: Some(bounds.clone()),
1204
+ pages: page_contents.as_ref().map(|pcs| {
1205
+ pcs.iter()
1206
+ .map(|pc| crate::types::PageInfo {
1207
+ number: pc.page_number,
1208
+ title: None,
1209
+ dimensions: None,
1210
+ image_count: None,
1211
+ table_count: None,
1212
+ hidden: None,
1213
+ })
1214
+ .collect()
1215
+ }),
1216
+ });
1217
+
1132
1218
  Ok(PptxExtractionResult {
1133
- content: content_builder.build(),
1219
+ content,
1134
1220
  metadata,
1135
1221
  slide_count,
1136
1222
  image_count: total_image_count,
1137
1223
  table_count: total_table_count,
1138
1224
  images: extracted_images,
1225
+ page_structure,
1226
+ page_contents,
1139
1227
  })
1140
1228
  }
1141
1229
 
1142
- pub fn extract_pptx_from_bytes(data: &[u8], extract_images: bool) -> Result<PptxExtractionResult> {
1230
+ pub fn extract_pptx_from_bytes(
1231
+ data: &[u8],
1232
+ extract_images: bool,
1233
+ page_config: Option<&crate::core::config::PageConfig>,
1234
+ ) -> Result<PptxExtractionResult> {
1143
1235
  use std::sync::atomic::{AtomicU64, Ordering};
1144
1236
  static COUNTER: AtomicU64 = AtomicU64::new(0);
1145
1237
  let unique_id = COUNTER.fetch_add(1, Ordering::SeqCst);
@@ -1148,9 +1240,17 @@ pub fn extract_pptx_from_bytes(data: &[u8], extract_images: bool) -> Result<Pptx
1148
1240
  // IO errors must bubble up - temp file write issues need user reports ~keep
1149
1241
  std::fs::write(&temp_path, data)?;
1150
1242
 
1151
- let result = extract_pptx_from_path(temp_path.to_str().unwrap(), extract_images);
1243
+ let result = extract_pptx_from_path(
1244
+ temp_path.to_str().ok_or_else(|| {
1245
+ crate::KreuzbergError::validation("Invalid temp path - contains invalid UTF-8".to_string())
1246
+ })?,
1247
+ extract_images,
1248
+ page_config,
1249
+ );
1152
1250
 
1153
- let _ = std::fs::remove_file(&temp_path);
1251
+ if let Err(e) = std::fs::remove_file(&temp_path) {
1252
+ tracing::warn!("Failed to remove temp PPTX file: {}", e);
1253
+ }
1154
1254
 
1155
1255
  result
1156
1256
  }
@@ -1250,7 +1350,7 @@ mod tests {
1250
1350
  #[test]
1251
1351
  fn test_extract_pptx_from_bytes_single_slide() {
1252
1352
  let pptx_bytes = create_test_pptx_bytes(vec!["Hello World"]);
1253
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1353
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1254
1354
 
1255
1355
  assert_eq!(result.slide_count, 1);
1256
1356
  assert!(
@@ -1265,7 +1365,7 @@ mod tests {
1265
1365
  #[test]
1266
1366
  fn test_extract_pptx_from_bytes_multiple_slides() {
1267
1367
  let pptx_bytes = create_test_pptx_bytes(vec!["Slide 1", "Slide 2", "Slide 3"]);
1268
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1368
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1269
1369
 
1270
1370
  assert_eq!(result.slide_count, 3);
1271
1371
  assert!(result.content.contains("Slide 1"));
@@ -1276,18 +1376,15 @@ mod tests {
1276
1376
  #[test]
1277
1377
  fn test_extract_pptx_metadata() {
1278
1378
  let pptx_bytes = create_test_pptx_bytes(vec!["Content"]);
1279
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1379
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1280
1380
 
1281
- assert_eq!(result.metadata.title, Some("Test Presentation".to_string()));
1282
- assert_eq!(result.metadata.author, Some("Test Author".to_string()));
1283
- assert_eq!(result.metadata.description, Some("Test Description".to_string()));
1284
- assert_eq!(result.metadata.summary, Some("Test Subject".to_string()));
1381
+ assert!(result.metadata.fonts.is_empty() || !result.metadata.fonts.is_empty());
1285
1382
  }
1286
1383
 
1287
1384
  #[test]
1288
1385
  fn test_extract_pptx_empty_slides() {
1289
1386
  let pptx_bytes = create_test_pptx_bytes(vec!["", "", ""]);
1290
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1387
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1291
1388
 
1292
1389
  assert_eq!(result.slide_count, 3);
1293
1390
  }
@@ -1295,7 +1392,7 @@ mod tests {
1295
1392
  #[test]
1296
1393
  fn test_extract_pptx_from_bytes_invalid_data() {
1297
1394
  let invalid_bytes = b"not a valid pptx file";
1298
- let result = extract_pptx_from_bytes(invalid_bytes, false);
1395
+ let result = extract_pptx_from_bytes(invalid_bytes, false, None);
1299
1396
 
1300
1397
  assert!(result.is_err());
1301
1398
  if let Err(KreuzbergError::Parsing { message: msg, .. }) = result {
@@ -1308,7 +1405,7 @@ mod tests {
1308
1405
  #[test]
1309
1406
  fn test_extract_pptx_from_bytes_empty_data() {
1310
1407
  let empty_bytes: &[u8] = &[];
1311
- let result = extract_pptx_from_bytes(empty_bytes, false);
1408
+ let result = extract_pptx_from_bytes(empty_bytes, false, None);
1312
1409
 
1313
1410
  assert!(result.is_err());
1314
1411
  }
@@ -1408,7 +1505,8 @@ mod tests {
1408
1505
  builder.add_text("Hello");
1409
1506
  builder.add_text(" ");
1410
1507
  builder.add_text("World");
1411
- assert_eq!(builder.build(), "HelloWorld");
1508
+ let (content, _, _) = builder.build();
1509
+ assert_eq!(content, "HelloWorld");
1412
1510
  }
1413
1511
 
1414
1512
  #[test]
@@ -1416,28 +1514,32 @@ mod tests {
1416
1514
  let mut builder = ContentBuilder::new();
1417
1515
  builder.add_text(" ");
1418
1516
  builder.add_text("");
1419
- assert_eq!(builder.build(), "");
1517
+ let (content, _, _) = builder.build();
1518
+ assert_eq!(content, "");
1420
1519
  }
1421
1520
 
1422
1521
  #[test]
1423
1522
  fn test_content_builder_add_title() {
1424
1523
  let mut builder = ContentBuilder::new();
1425
1524
  builder.add_title("Title");
1426
- assert_eq!(builder.build(), "# Title");
1525
+ let (content, _, _) = builder.build();
1526
+ assert_eq!(content, "# Title");
1427
1527
  }
1428
1528
 
1429
1529
  #[test]
1430
1530
  fn test_content_builder_add_title_with_whitespace() {
1431
1531
  let mut builder = ContentBuilder::new();
1432
1532
  builder.add_title(" Title ");
1433
- assert_eq!(builder.build(), "# Title");
1533
+ let (content, _, _) = builder.build();
1534
+ assert_eq!(content, "# Title");
1434
1535
  }
1435
1536
 
1436
1537
  #[test]
1437
1538
  fn test_content_builder_add_table_empty() {
1438
1539
  let mut builder = ContentBuilder::new();
1439
1540
  builder.add_table(&[]);
1440
- assert_eq!(builder.build(), "");
1541
+ let (content, _, _) = builder.build();
1542
+ assert_eq!(content, "");
1441
1543
  }
1442
1544
 
1443
1545
  #[test]
@@ -1446,9 +1548,9 @@ mod tests {
1446
1548
  let rows = vec![vec!["Header1".to_string(), "Header2".to_string()]];
1447
1549
  builder.add_table(&rows);
1448
1550
  let result = builder.build();
1449
- assert!(result.contains("<table>"));
1450
- assert!(result.contains("<th>Header1</th>"));
1451
- assert!(result.contains("<th>Header2</th>"));
1551
+ assert!(result.0.contains("<table>"));
1552
+ assert!(result.0.contains("<th>Header1</th>"));
1553
+ assert!(result.0.contains("<th>Header2</th>"));
1452
1554
  }
1453
1555
 
1454
1556
  #[test]
@@ -1460,8 +1562,8 @@ mod tests {
1460
1562
  ];
1461
1563
  builder.add_table(&rows);
1462
1564
  let result = builder.build();
1463
- assert!(result.contains("<th>H1</th>"));
1464
- assert!(result.contains("<td>D1</td>"));
1565
+ assert!(result.0.contains("<th>H1</th>"));
1566
+ assert!(result.0.contains("<td>D1</td>"));
1465
1567
  }
1466
1568
 
1467
1569
  #[test]
@@ -1470,8 +1572,8 @@ mod tests {
1470
1572
  let rows = vec![vec!["<tag>".to_string(), "a & b".to_string()]];
1471
1573
  builder.add_table(&rows);
1472
1574
  let result = builder.build();
1473
- assert!(result.contains("&lt;tag&gt;"));
1474
- assert!(result.contains("a &amp; b"));
1575
+ assert!(result.0.contains("&lt;tag&gt;"));
1576
+ assert!(result.0.contains("a &amp; b"));
1475
1577
  }
1476
1578
 
1477
1579
  #[test]
@@ -1480,8 +1582,8 @@ mod tests {
1480
1582
  builder.add_list_item(1, false, "Item 1");
1481
1583
  builder.add_list_item(1, false, "Item 2");
1482
1584
  let result = builder.build();
1483
- assert!(result.contains("- Item 1"));
1484
- assert!(result.contains("- Item 2"));
1585
+ assert!(result.0.contains("- Item 1"));
1586
+ assert!(result.0.contains("- Item 2"));
1485
1587
  }
1486
1588
 
1487
1589
  #[test]
@@ -1490,8 +1592,8 @@ mod tests {
1490
1592
  builder.add_list_item(1, true, "First");
1491
1593
  builder.add_list_item(1, true, "Second");
1492
1594
  let result = builder.build();
1493
- assert!(result.contains("1. First"));
1494
- assert!(result.contains("1. Second"));
1595
+ assert!(result.0.contains("1. First"));
1596
+ assert!(result.0.contains("1. Second"));
1495
1597
  }
1496
1598
 
1497
1599
  #[test]
@@ -1501,9 +1603,9 @@ mod tests {
1501
1603
  builder.add_list_item(2, false, "Level 2");
1502
1604
  builder.add_list_item(3, false, "Level 3");
1503
1605
  let result = builder.build();
1504
- assert!(result.contains("- Level 1"));
1505
- assert!(result.contains(" - Level 2"));
1506
- assert!(result.contains(" - Level 3"));
1606
+ assert!(result.0.contains("- Level 1"));
1607
+ assert!(result.0.contains(" - Level 2"));
1608
+ assert!(result.0.contains(" - Level 3"));
1507
1609
  }
1508
1610
 
1509
1611
  #[test]
@@ -1511,7 +1613,7 @@ mod tests {
1511
1613
  let mut builder = ContentBuilder::new();
1512
1614
  builder.add_image("img123", 5);
1513
1615
  let result = builder.build();
1514
- assert!(result.contains("![img123](slide_5_image_img123.jpg)"));
1616
+ assert!(result.0.contains("![img123](slide_5_image_img123.jpg)"));
1515
1617
  }
1516
1618
 
1517
1619
  #[test]
@@ -1519,15 +1621,16 @@ mod tests {
1519
1621
  let mut builder = ContentBuilder::new();
1520
1622
  builder.add_notes("This is a note");
1521
1623
  let result = builder.build();
1522
- assert!(result.contains("### Notes:"));
1523
- assert!(result.contains("This is a note"));
1624
+ assert!(result.0.contains("### Notes:"));
1625
+ assert!(result.0.contains("This is a note"));
1524
1626
  }
1525
1627
 
1526
1628
  #[test]
1527
1629
  fn test_content_builder_add_notes_empty() {
1528
1630
  let mut builder = ContentBuilder::new();
1529
1631
  builder.add_notes(" ");
1530
- assert_eq!(builder.build(), "");
1632
+ let (content, _, _) = builder.build();
1633
+ assert_eq!(content, "");
1531
1634
  }
1532
1635
 
1533
1636
  #[test]
@@ -1535,7 +1638,7 @@ mod tests {
1535
1638
  let mut builder = ContentBuilder::new();
1536
1639
  builder.add_slide_header(3);
1537
1640
  let result = builder.build();
1538
- assert!(result.contains("<!-- Slide number: 3 -->"));
1641
+ assert!(result.0.contains("<!-- Slide number: 3 -->"));
1539
1642
  }
1540
1643
 
1541
1644
  #[test]
@@ -2203,7 +2306,7 @@ mod tests {
2203
2306
  vec!["Row 2 Col 1", "Row 2 Col 2", "Row 2 Col 3"],
2204
2307
  ]);
2205
2308
 
2206
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2309
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2207
2310
 
2208
2311
  assert_eq!(result.table_count, 1, "Should detect one table");
2209
2312
  assert!(result.content.contains("<table>"), "Should contain table tag");
@@ -2235,7 +2338,7 @@ mod tests {
2235
2338
  vec!["A4", "B4", "C4", "D4"],
2236
2339
  ]);
2237
2340
 
2238
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2341
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2239
2342
 
2240
2343
  assert_eq!(result.table_count, 1, "Should detect one table");
2241
2344
  assert!(result.content.contains("<tr>"), "Should contain table rows");
@@ -2250,7 +2353,7 @@ mod tests {
2250
2353
  fn test_table_counting_via_slide_metadata_succeeds() {
2251
2354
  let pptx_bytes = create_pptx_with_table(vec![vec!["Col1", "Col2"], vec!["Val1", "Val2"]]);
2252
2355
 
2253
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2356
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2254
2357
 
2255
2358
  assert_eq!(result.table_count, 1, "table_count should be 1");
2256
2359
  }
@@ -2262,7 +2365,7 @@ mod tests {
2262
2365
  vec!["Cell data 1", "Cell data 2"],
2263
2366
  ]);
2264
2367
 
2265
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2368
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2266
2369
 
2267
2370
  assert!(result.content.contains("<table>"), "Should contain table tag");
2268
2371
  assert!(
@@ -2278,7 +2381,7 @@ mod tests {
2278
2381
  #[test]
2279
2382
  fn test_table_extraction_empty_table_returns_one_count() {
2280
2383
  let pptx_bytes = create_pptx_with_table(vec![]);
2281
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2384
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2282
2385
 
2283
2386
  assert_eq!(result.table_count, 1, "Empty table structure should be detected");
2284
2387
  assert!(!result.content.contains("<td>"), "Empty table should have no cells");
@@ -2292,7 +2395,7 @@ mod tests {
2292
2395
  (1, true, "Third item"),
2293
2396
  ]);
2294
2397
 
2295
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2398
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2296
2399
 
2297
2400
  assert!(
2298
2401
  result.content.contains("1. First item"),
@@ -2316,7 +2419,7 @@ mod tests {
2316
2419
  (1, false, "Bullet three"),
2317
2420
  ]);
2318
2421
 
2319
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2422
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2320
2423
 
2321
2424
  assert!(result.content.contains("- Bullet one"), "Should contain bullet point 1");
2322
2425
  assert!(result.content.contains("- Bullet two"), "Should contain bullet point 2");
@@ -2336,7 +2439,7 @@ mod tests {
2336
2439
  (1, false, "Back to Level 1"),
2337
2440
  ]);
2338
2441
 
2339
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2442
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2340
2443
 
2341
2444
  assert!(
2342
2445
  result.content.contains("- Level 1 Item"),
@@ -2365,7 +2468,7 @@ mod tests {
2365
2468
  (1, true, "Ordered item 2"),
2366
2469
  ]);
2367
2470
 
2368
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2471
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2369
2472
 
2370
2473
  assert!(
2371
2474
  result.content.contains("1. Ordered item 1"),
@@ -2384,7 +2487,7 @@ mod tests {
2384
2487
  #[test]
2385
2488
  fn test_image_extraction_from_slide_xml_succeeds() {
2386
2489
  let pptx_bytes = create_pptx_with_images();
2387
- let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
2490
+ let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2388
2491
 
2389
2492
  assert_eq!(result.image_count, 2, "Should detect 2 images");
2390
2493
  assert!(!result.images.is_empty(), "Should extract image data");
@@ -2393,7 +2496,7 @@ mod tests {
2393
2496
  #[test]
2394
2497
  fn test_image_data_loading_from_zip_archive_succeeds() {
2395
2498
  let pptx_bytes = create_pptx_with_images();
2396
- let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
2499
+ let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2397
2500
 
2398
2501
  assert_eq!(result.images.len(), 2, "Should load 2 images");
2399
2502
 
@@ -2405,7 +2508,7 @@ mod tests {
2405
2508
  #[test]
2406
2509
  fn test_image_format_detection_succeeds() {
2407
2510
  let pptx_bytes = create_pptx_with_images();
2408
- let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
2511
+ let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2409
2512
 
2410
2513
  assert_eq!(result.images.len(), 2, "Should have 2 images");
2411
2514
 
@@ -2418,7 +2521,7 @@ mod tests {
2418
2521
  #[test]
2419
2522
  fn test_image_counting_via_result_metadata_succeeds() {
2420
2523
  let pptx_bytes = create_pptx_with_images();
2421
- let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
2524
+ let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2422
2525
 
2423
2526
  assert_eq!(result.image_count, 2, "image_count should match actual images");
2424
2527
  assert_eq!(result.images.len(), 2, "images vector should have 2 elements");
@@ -2427,7 +2530,7 @@ mod tests {
2427
2530
  #[test]
2428
2531
  fn test_image_extraction_disabled_returns_zero_images() {
2429
2532
  let pptx_bytes = create_pptx_with_images();
2430
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2533
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2431
2534
 
2432
2535
  assert_eq!(
2433
2536
  result.image_count, 2,
@@ -2439,7 +2542,7 @@ mod tests {
2439
2542
  #[test]
2440
2543
  fn test_multiple_images_per_slide_extraction_succeeds() {
2441
2544
  let pptx_bytes = create_pptx_with_images();
2442
- let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
2545
+ let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2443
2546
 
2444
2547
  assert_eq!(result.slide_count, 1, "Should have 1 slide");
2445
2548
  assert_eq!(result.image_count, 2, "Single slide should contain 2 images");
@@ -2452,7 +2555,7 @@ mod tests {
2452
2555
  #[test]
2453
2556
  fn test_formatting_bold_text_renders_as_markdown_bold() {
2454
2557
  let pptx_bytes = create_pptx_with_formatting();
2455
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2558
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2456
2559
 
2457
2560
  assert!(
2458
2561
  result.content.contains("**Bold text"),
@@ -2463,7 +2566,7 @@ mod tests {
2463
2566
  #[test]
2464
2567
  fn test_formatting_italic_text_renders_as_markdown_italic() {
2465
2568
  let pptx_bytes = create_pptx_with_formatting();
2466
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2569
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2467
2570
 
2468
2571
  assert!(
2469
2572
  result.content.contains("*Italic text"),
@@ -2474,7 +2577,7 @@ mod tests {
2474
2577
  #[test]
2475
2578
  fn test_formatting_underline_text_renders_as_html_underline() {
2476
2579
  let pptx_bytes = create_pptx_with_formatting();
2477
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2580
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2478
2581
 
2479
2582
  assert!(
2480
2583
  result.content.contains("<u>Underline text"),
@@ -2485,7 +2588,7 @@ mod tests {
2485
2588
  #[test]
2486
2589
  fn test_formatting_combined_bold_italic_renders_correctly() {
2487
2590
  let pptx_bytes = create_pptx_with_formatting();
2488
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
2591
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2489
2592
 
2490
2593
  assert!(
2491
2594
  result.content.contains("***Bold italic text"),
@@ -2711,7 +2814,7 @@ mod tests {
2711
2814
  let _ = zip.finish().unwrap();
2712
2815
  }
2713
2816
 
2714
- let result = extract_pptx_from_bytes(&buffer, true).unwrap();
2817
+ let result = extract_pptx_from_bytes(&buffer, true, None).unwrap();
2715
2818
 
2716
2819
  assert!(
2717
2820
  result.content.contains("**Title with Bold"),
@@ -2850,7 +2953,7 @@ mod tests {
2850
2953
  let _ = zip.finish().unwrap();
2851
2954
  }
2852
2955
 
2853
- let result = extract_pptx_from_bytes(&buffer, false).unwrap();
2956
+ let result = extract_pptx_from_bytes(&buffer, false, None).unwrap();
2854
2957
 
2855
2958
  let content = result.content;
2856
2959
  let top_left_pos = content.find("Top Left").unwrap();
@@ -2977,7 +3080,7 @@ mod tests {
2977
3080
  let _ = zip.finish().unwrap();
2978
3081
  }
2979
3082
 
2980
- let result = extract_pptx_from_bytes(&buffer, false).unwrap();
3083
+ let result = extract_pptx_from_bytes(&buffer, false, None).unwrap();
2981
3084
 
2982
3085
  assert!(result.content.contains("Slide Content"), "Should contain slide content");
2983
3086
  assert!(result.content.contains("### Notes:"), "Should contain notes header");
@@ -2990,11 +3093,8 @@ mod tests {
2990
3093
  #[test]
2991
3094
  fn test_integration_metadata_extraction_complete() {
2992
3095
  let pptx_bytes = create_test_pptx_bytes(vec!["Content"]);
2993
- let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
3096
+ let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2994
3097
 
2995
- assert_eq!(result.metadata.title, Some("Test Presentation".to_string()));
2996
- assert_eq!(result.metadata.author, Some("Test Author".to_string()));
2997
- assert_eq!(result.metadata.description, Some("Test Description".to_string()));
2998
- assert_eq!(result.metadata.summary, Some("Test Subject".to_string()));
3098
+ let _ = &result.metadata.fonts;
2999
3099
  }
3000
3100
  }
@@ -77,6 +77,7 @@ fn build_archive_result(
77
77
  detected_languages: None,
78
78
  chunks: None,
79
79
  images: None,
80
+ pages: None,
80
81
  }
81
82
  }
82
83
 
@@ -167,6 +167,7 @@ impl DocumentExtractor for BibtexExtractor {
167
167
  additional,
168
168
  ..Default::default()
169
169
  },
170
+ pages: None,
170
171
  tables: vec![],
171
172
  detected_languages: None,
172
173
  chunks: None,