kreuzberg 4.1.2 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  5. data/kreuzberg.gemspec +13 -1
  6. data/lib/kreuzberg/config.rb +70 -35
  7. data/lib/kreuzberg/version.rb +1 -1
  8. data/sig/kreuzberg.rbs +5 -1
  9. data/spec/binding/batch_operations_spec.rb +80 -0
  10. data/spec/binding/metadata_types_spec.rb +77 -57
  11. data/spec/serialization_spec.rb +134 -0
  12. data/spec/unit/config/output_format_spec.rb +380 -0
  13. data/vendor/Cargo.toml +1 -1
  14. data/vendor/kreuzberg/Cargo.toml +1 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  17. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  18. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  19. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  20. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  21. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  22. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  23. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  24. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  25. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  26. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  27. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  28. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  29. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  30. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  31. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  32. data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
  33. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  34. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  35. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  36. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  37. data/vendor/kreuzberg/tests/core_integration.rs +55 -53
  38. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  39. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  40. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  41. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  42. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  43. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  44. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  45. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  46. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  47. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  48. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  49. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  50. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  51. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  52. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  53. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  54. data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
  55. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  56. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  57. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  58. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  59. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  60. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  61. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  62. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  63. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  64. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  65. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  66. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  67. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  68. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  69. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
  70. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  71. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  72. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  73. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  74. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  75. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  76. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  77. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  78. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  79. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  80. metadata +10 -2
data/vendor/kreuzberg/tests/concurrency_stress.rs
@@ -120,7 +120,7 @@ async fn test_concurrent_batch_extractions() {
     for handle in handles {
         let results = handle.await.expect("Task should not panic");
         assert!(results.is_ok(), "Batch extraction should succeed");
-        let results = results.unwrap();
+        let results = results.expect("Operation failed");
         assert_eq!(results.len(), 20, "Should return all results");
     }
 }
@@ -147,7 +147,9 @@ async fn test_concurrent_extractions_with_cache() {

     let test_data = b"Cached content for concurrent access test";

-    let _ = extract_bytes(test_data, "text/plain", &config).await.unwrap();
+    let _ = extract_bytes(test_data, "text/plain", &config)
+        .await
+        .expect("Async operation failed");

     let mut handles = vec![];
     for _ in 0..100 {
@@ -163,7 +165,7 @@ async fn test_concurrent_extractions_with_cache() {
     for handle in handles {
         let result = handle.await.expect("Task should not panic");
         assert!(result.is_ok(), "Cache read should succeed");
-        let extraction = result.unwrap();
+        let extraction = result.expect("Operation failed");
         assert_text_content(&extraction.content, expected_content);
     }
 }
@@ -225,7 +227,7 @@ async fn test_concurrent_ocr_processing() {
     let mut extracted_texts = vec![];
     for result in results {
         assert!(result.is_ok(), "OCR should succeed: {:?}", result.err());
-        let extraction = result.unwrap();
+        let extraction = result.expect("Operation failed");
         assert!(!extraction.content.is_empty(), "OCR should extract text");
         extracted_texts.push(extraction.content);
     }
@@ -394,7 +396,7 @@ async fn test_concurrent_pipeline_processing() {
     for handle in handles {
         let result = handle.await.expect("Task should not panic");
         assert!(result.is_ok(), "Pipeline should succeed");
-        let processed = result.unwrap();
+        let processed = result.expect("Operation failed");
         assert!(processed.content.contains("[processed]"), "Processor should run");
     }

@@ -457,7 +459,9 @@ async fn test_extraction_throughput_scales() {

     let sequential_start = std::time::Instant::now();
     for _ in 0..20 {
-        let _ = extract_bytes(test_data, "text/plain", &config).await.unwrap();
+        let _ = extract_bytes(test_data, "text/plain", &config)
+            .await
+            .expect("Async operation failed");
     }
     let sequential_duration = sequential_start.elapsed();

data/vendor/kreuzberg/tests/config_behavioral.rs (new file)
@@ -0,0 +1,414 @@
+//! Config behavioral verification tests
+//!
+//! These tests verify that configuration options actually affect extraction behavior,
+//! not just that they serialize correctly.
+//!
+//! Unlike serialization tests that only check if configs deserialize, these tests verify
+//! that the configuration options actually influence the extraction process and produce
+//! observable differences in the output.
+
+use kreuzberg::core::config::ChunkingConfig;
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::config::OutputFormat;
+use kreuzberg::core::extractor::extract_bytes;
+use kreuzberg::types::OutputFormat as ResultFormat;
+
+mod helpers;
+
+/// Test output_format Plain produces text without formatting
+///
+/// Note: HTML extractors often convert to markdown internally, so this test
+/// uses plain text input to verify the output_format configuration is respected.
+#[tokio::test]
+async fn test_output_format_plain_produces_plain() {
+    let plain_text = b"Title\n\nParagraph with bold text.";
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Plain,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(plain_text, "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Plain text should not have markdown or HTML formatting
+    assert!(
+        !result.content.contains("# ") && !result.content.contains("<h1>"),
+        "Plain format should not contain markdown headers or HTML tags, got: {}",
+        result.content
+    );
+    assert!(
+        result.content.contains("Title") || result.content.contains("Paragraph"),
+        "Should still contain extracted text content"
+    );
+}
+
+/// Test output_format Markdown produces markdown formatting
+#[tokio::test]
+async fn test_output_format_markdown_produces_markdown() {
+    let html = b"<h1>Title</h1><p>Paragraph with <strong>bold</strong> text.</p>";
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(html, "text/html", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Verify markdown formatting is present (# for headers or ** for bold)
+    let has_markdown = result.content.contains("# ") || result.content.contains("**") || result.content.contains("*");
+
+    assert!(
+        has_markdown,
+        "Markdown format should contain # headers or ** bold, got: {}",
+        result.content
+    );
+}
+
+/// Test output_format HTML produces valid HTML content
+#[tokio::test]
+async fn test_output_format_html_produces_html() {
+    let text = "Title\n\nParagraph with bold text.";
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Html,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // HTML format should be safe and not contain injection vectors
+    assert!(
+        !result.content.contains("<script>"),
+        "HTML format should be safe from injection"
+    );
+    assert!(!result.content.is_empty(), "Should produce content in HTML format");
+}
+
+/// Test result_format Unified produces content in single field
+#[tokio::test]
+async fn test_result_format_unified_structure() {
+    let text = "Sample content";
+
+    let config = ExtractionConfig {
+        result_format: ResultFormat::Unified,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Unified format should have content in main content field
+    assert!(!result.content.is_empty(), "Unified format should have content");
+
+    // Elements should be None or empty for unified format
+    assert!(
+        result.elements.is_none() || result.elements.as_ref().unwrap().is_empty(),
+        "Unified format should not have elements"
+    );
+}
+
+/// Test result_format ElementBased produces element structure
+#[tokio::test]
+async fn test_result_format_element_based_structure() {
+    let text = "First paragraph here.\n\nSecond paragraph with more content.";
+
+    let config = ExtractionConfig {
+        result_format: ResultFormat::ElementBased,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Element-based format should produce elements array
+    if let Some(elements) = &result.elements {
+        assert!(!elements.is_empty(), "Element-based format should have elements");
+        // Verify elements have expected structure
+        for element in elements {
+            assert!(!element.text.is_empty(), "Elements should have non-empty text");
+        }
+    }
+}
+
+/// Test chunking max_chars actually limits chunk size
+#[tokio::test]
+#[cfg(feature = "chunking")]
+async fn test_chunking_max_chars_limits_chunk_size() {
+    let long_text = "word ".repeat(500); // ~2500 characters
+
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_chars: 100,
+            max_overlap: 20,
+            embedding: None,
+            preset: None,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_bytes(long_text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    assert!(result.chunks.is_some(), "Chunking should produce chunks");
+
+    if let Some(chunks) = result.chunks {
+        assert!(chunks.len() > 1, "Long text should produce multiple chunks");
+
+        // Verify chunk size constraint: each chunk should respect max_chars
+        for (i, chunk) in chunks.iter().enumerate() {
+            assert!(
+                chunk.content.len() <= 100 + 20,
+                "Chunk {} exceeds max_chars + overlap: length = {}",
+                i,
+                chunk.content.len()
+            );
+        }
+    }
+}
+
+/// Test chunking with overlap creates overlapping chunks
+#[tokio::test]
+#[cfg(feature = "chunking")]
+async fn test_chunking_overlap_creates_overlap() {
+    let text = "First sentence. ".repeat(30); // ~480 characters
+
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_chars: 50,
+            max_overlap: 15,
+            embedding: None,
+            preset: None,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    if let Some(chunks) = result.chunks {
+        if chunks.len() >= 2 {
+            // Check if adjacent chunks have overlapping text
+            let chunk1_end = &chunks[0].content[chunks[0].content.len().saturating_sub(15)..];
+            let chunk2_start = &chunks[1].content[..chunks[1].content.len().min(15)];
+
+            // There should be some overlap in the text
+            let overlap_found = chunk1_end.chars().any(|c| c != ' ') && chunk2_start.chars().any(|c| c != ' ');
+
+            assert!(
+                overlap_found,
+                "Adjacent chunks should have overlapping non-whitespace text"
+            );
+        }
+    }
+}
+
+/// Test chunking disabled produces no chunks
+#[tokio::test]
+async fn test_chunking_disabled_produces_no_chunks() {
+    let long_text = "word ".repeat(500);
+
+    let config = ExtractionConfig {
+        chunking: None,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(long_text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    assert!(result.chunks.is_none(), "Chunking disabled should produce no chunks");
+}
+
+/// Test use_cache true allows results to be cached
+#[tokio::test]
+async fn test_cache_enabled_allows_caching() {
+    let text = "Test content for caching";
+
+    let config = ExtractionConfig {
+        use_cache: true,
+        ..Default::default()
+    };
+
+    // Extract twice with same content
+    let result1 = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("First extraction should succeed");
+
+    let result2 = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Second extraction should succeed");
+
+    // Results should be identical
+    assert_eq!(
+        result1.content, result2.content,
+        "Cache enabled should produce consistent results"
+    );
+}
+
+/// Test use_cache false disables caching without crashing
+#[tokio::test]
+async fn test_cache_disabled_does_not_crash() {
+    let text = "Test content without caching";
+
+    let config = ExtractionConfig {
+        use_cache: false,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Extraction with cache disabled should succeed");
+
+    assert!(!result.content.is_empty(), "Should still extract content");
+}
+
+/// Test quality_processing enabled produces quality score
+#[tokio::test]
+#[cfg(feature = "quality")]
+async fn test_quality_processing_enabled_produces_score() {
+    let text = "This is a well-structured document. It has proper sentences. And good formatting.";
+
+    let config = ExtractionConfig {
+        enable_quality_processing: true,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Quality processing should add a quality_score to metadata
+    let has_quality_score = result.metadata.additional.contains_key("quality_score");
+    assert!(
+        has_quality_score,
+        "Quality processing enabled should produce quality_score in metadata"
+    );
+}
+
+/// Test quality_processing disabled does not produce score
+#[tokio::test]
+#[cfg(feature = "quality")]
+async fn test_quality_processing_disabled_no_score() {
+    let text = "This is a document.";
+
+    let config = ExtractionConfig {
+        enable_quality_processing: false,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    assert!(
+        !result.metadata.additional.contains_key("quality_score"),
+        "Quality processing disabled should not produce quality_score"
+    );
+}
+
+/// Test output_format combinations with result_format
+#[tokio::test]
+async fn test_output_format_with_element_based() {
+    let html = b"<p>First paragraph</p><p>Second paragraph</p>";
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        result_format: ResultFormat::ElementBased,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(html, "text/html", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Should have elements
+    assert!(result.elements.is_some(), "ElementBased format should produce elements");
+
+    // Content should still be markdown formatted
+    assert!(
+        !result.content.contains("<p>"),
+        "Output format should not contain HTML tags"
+    );
+}
+
+/// Test chunking respects overlap maximum
+#[tokio::test]
+#[cfg(feature = "chunking")]
+async fn test_chunking_overlap_maximum() {
+    let text = "x".repeat(200); // Simple repeated character
+
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_chars: 60,
+            max_overlap: 10,
+            embedding: None,
+            preset: None,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    if let Some(chunks) = result.chunks {
+        // Verify max_overlap is not exceeded
+        for (i, chunk) in chunks.iter().enumerate() {
+            assert!(
+                chunk.content.len() <= 60 + 10,
+                "Chunk {} size {} exceeds max_chars (60) + max_overlap (10)",
+                i,
+                chunk.content.len()
+            );
+        }
+    }
+}
+
+/// Test large document extraction with multiple config options
+#[tokio::test]
+#[cfg(feature = "chunking")]
+async fn test_large_document_with_combined_config() {
+    let large_text = "This is a paragraph. ".repeat(100); // ~2000 characters
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Plain,
+        chunking: Some(ChunkingConfig {
+            max_chars: 200,
+            max_overlap: 30,
+            embedding: None,
+            preset: None,
+        }),
+        use_cache: true,
+        enable_quality_processing: true,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(large_text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Should have chunks due to size
+    assert!(result.chunks.is_some(), "Should produce chunks for large text");
+
+    // Should have quality score
+    #[cfg(feature = "quality")]
+    {
+        assert!(
+            result.metadata.additional.contains_key("quality_score"),
+            "Should have quality score"
+        );
+    }
+
+    // Should have content in plain format
+    assert!(!result.content.is_empty(), "Should have content");
+}
data/vendor/kreuzberg/tests/config_features.rs
@@ -35,14 +35,14 @@ async fn test_chunking_enabled() {
         .expect("Should extract successfully");

     assert!(result.chunks.is_some(), "Chunks should be present");
-    let chunks = result.chunks.unwrap();
+    let chunks = result.chunks.expect("Operation failed");
     assert!(chunks.len() > 1, "Should have multiple chunks");

     assert!(result.metadata.additional.contains_key("chunk_count"));
-    let chunk_count = result.metadata.additional.get("chunk_count").unwrap();
+    let chunk_count = result.metadata.additional.get("chunk_count").expect("Value not found");
     assert_eq!(
         chunks.len(),
-        chunk_count.as_u64().unwrap() as usize,
+        chunk_count.as_u64().expect("Operation failed") as usize,
         "Chunks length should match chunk_count metadata"
     );

@@ -78,7 +78,7 @@ async fn test_chunking_with_overlap() {
         .expect("Should extract successfully");

     assert!(result.chunks.is_some(), "Chunks should be present");
-    let chunks = result.chunks.unwrap();
+    let chunks = result.chunks.expect("Operation failed");
     assert!(chunks.len() >= 2, "Should have at least 2 chunks");

     assert!(result.metadata.additional.contains_key("chunk_count"));
@@ -118,7 +118,7 @@ async fn test_chunking_custom_sizes() {
         .expect("Should extract successfully");

     assert!(result.chunks.is_some(), "Chunks should be present");
-    let chunks = result.chunks.unwrap();
+    let chunks = result.chunks.expect("Operation failed");
     assert!(!chunks.is_empty(), "Should have at least 1 chunk");

     assert!(result.metadata.additional.contains_key("chunk_count"));
@@ -178,7 +178,7 @@ async fn test_language_detection_single() {
         .expect("Should extract successfully");

     assert!(result.detected_languages.is_some(), "Should detect language");
-    let languages = result.detected_languages.unwrap();
+    let languages = result.detected_languages.expect("Operation failed");
     assert!(!languages.is_empty(), "Should detect at least one language");
     assert_eq!(languages[0], "eng", "Should detect English");
 }
@@ -205,7 +205,7 @@ async fn test_language_detection_multiple() {
         .expect("Should extract successfully");

     assert!(result.detected_languages.is_some(), "Should detect languages");
-    let languages = result.detected_languages.unwrap();
+    let languages = result.detected_languages.expect("Operation failed");
     assert!(!languages.is_empty(), "Should detect at least one language");
 }

@@ -424,7 +424,7 @@ async fn test_quality_processing_enabled() {
         .expect("Should extract successfully");

     if let Some(score) = result.metadata.additional.get("quality_score") {
-        let score_value = score.as_f64().unwrap();
+        let score_value = score.as_f64().expect("Operation failed");
         assert!((0.0..=1.0).contains(&score_value));
     }

@@ -463,16 +463,16 @@ async fn test_quality_threshold_filtering() {
         .metadata
         .additional
         .get("quality_score")
-        .unwrap()
+        .expect("Operation failed")
         .as_f64()
-        .unwrap();
+        .expect("Operation failed");
     let score_low = result_low
         .metadata
         .additional
         .get("quality_score")
-        .unwrap()
+        .expect("Operation failed")
         .as_f64()
-        .unwrap();
+        .expect("Operation failed");

     assert!((0.0..=1.0).contains(&score_high));
     assert!((0.0..=1.0).contains(&score_low));
@@ -528,7 +528,7 @@ async fn test_chunking_with_embeddings() {
         .expect("Should extract successfully");

     assert!(result.chunks.is_some(), "Chunks should be present");
-    let chunks = result.chunks.unwrap();
+    let chunks = result.chunks.expect("Operation failed");
     assert!(chunks.len() > 1, "Should have multiple chunks");

     println!("Metadata: {:?}", result.metadata.additional);
@@ -542,13 +542,17 @@ async fn test_chunking_with_embeddings() {
         "Should have embeddings_generated metadata"
     );
     assert_eq!(
-        result.metadata.additional.get("embeddings_generated").unwrap(),
+        result
+            .metadata
+            .additional
+            .get("embeddings_generated")
+            .expect("Value not found"),
         &serde_json::Value::Bool(true)
     );

     for chunk in &chunks {
         assert!(chunk.embedding.is_some(), "Each chunk should have an embedding");
-        let embedding = chunk.embedding.as_ref().unwrap();
+        let embedding = chunk.embedding.as_ref().expect("Operation failed");
         assert_eq!(
             embedding.len(),
             768,