kreuzberg 4.1.2 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  5. data/kreuzberg.gemspec +13 -1
  6. data/lib/kreuzberg/config.rb +70 -35
  7. data/lib/kreuzberg/version.rb +1 -1
  8. data/sig/kreuzberg.rbs +5 -1
  9. data/spec/binding/batch_operations_spec.rb +80 -0
  10. data/spec/binding/metadata_types_spec.rb +77 -57
  11. data/spec/serialization_spec.rb +134 -0
  12. data/spec/unit/config/output_format_spec.rb +380 -0
  13. data/vendor/Cargo.toml +1 -1
  14. data/vendor/kreuzberg/Cargo.toml +1 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  17. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  18. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  19. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  20. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  21. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  22. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  23. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  24. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  25. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  26. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  27. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  28. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  29. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  30. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  31. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  32. data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
  33. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  34. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  35. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  36. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  37. data/vendor/kreuzberg/tests/core_integration.rs +55 -53
  38. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  39. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  40. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  41. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  42. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  43. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  44. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  45. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  46. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  47. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  48. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  49. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  50. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  51. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  52. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  53. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  54. data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
  55. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  56. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  57. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  58. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  59. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  60. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  61. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  62. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  63. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  64. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  65. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  66. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  67. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  68. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  69. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
  70. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  71. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  72. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  73. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  74. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  75. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  76. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  77. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  78. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  79. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  80. metadata +10 -2
@@ -11,9 +11,9 @@ fn test_file_path(filename: &str) -> PathBuf {
11
11
  let manifest_dir = env!("CARGO_MANIFEST_DIR");
12
12
  PathBuf::from(manifest_dir)
13
13
  .parent()
14
- .unwrap()
14
+ .expect("Operation failed")
15
15
  .parent()
16
- .unwrap()
16
+ .expect("Operation failed")
17
17
  .join("test_documents")
18
18
  .join("docbook")
19
19
  .join(filename)
@@ -72,7 +72,7 @@ async fn test_docbook4_chapter_extraction() {
72
72
  let result = extract_docbook4_file("docbook-chapter.docbook").await;
73
73
  assert!(result.is_ok(), "Failed to extract DocBook 4 chapter");
74
74
 
75
- let result = result.unwrap();
75
+ let result = result.expect("Operation failed");
76
76
  assert!(!result.content.is_empty(), "Content should not be empty");
77
77
  assert!(
78
78
  result.content.contains("Test Chapter"),
@@ -89,7 +89,7 @@ async fn test_docbook5_reader_extraction() {
89
89
  let result = extract_docbook5_file("docbook-reader.docbook").await;
90
90
  assert!(result.is_ok(), "Failed to extract DocBook 5 file");
91
91
 
92
- let result = result.unwrap();
92
+ let result = result.expect("Operation failed");
93
93
  assert!(!result.content.is_empty(), "Content should not be empty");
94
94
  assert!(
95
95
  result.content.contains("Pandoc Test Suite"),
@@ -102,7 +102,7 @@ async fn test_docbook_xref_extraction() {
102
102
  let result = extract_docbook4_file("docbook-xref.docbook").await;
103
103
  assert!(result.is_ok(), "Failed to extract DocBook with xref elements");
104
104
 
105
- let result = result.unwrap();
105
+ let result = result.expect("Operation failed");
106
106
  assert!(!result.content.is_empty(), "Content should not be empty");
107
107
  assert!(
108
108
  result.content.contains("An Example Book"),
@@ -119,7 +119,7 @@ async fn test_docbook_tables_extraction() {
119
119
  let result = extract_docbook4_file("tables.docbook4").await;
120
120
  assert!(result.is_ok(), "Failed to extract DocBook with tables");
121
121
 
122
- let result = result.unwrap();
122
+ let result = result.expect("Operation failed");
123
123
  assert!(!result.content.is_empty(), "Content should not be empty");
124
124
  assert!(!result.tables.is_empty(), "Should extract tables from DocBook");
125
125
  }
@@ -129,7 +129,7 @@ async fn test_docbook5_tables_extraction() {
129
129
  let result = extract_docbook5_file("tables.docbook5").await;
130
130
  assert!(result.is_ok(), "Failed to extract DocBook 5 with tables");
131
131
 
132
- let result = result.unwrap();
132
+ let result = result.expect("Operation failed");
133
133
  assert!(!result.content.is_empty(), "Content should not be empty");
134
134
  assert!(!result.tables.is_empty(), "Should extract tables from DocBook 5");
135
135
  }
@@ -139,7 +139,7 @@ async fn test_docbook_metadata_extraction() {
139
139
  let result = extract_docbook5_file("docbook-reader.docbook").await;
140
140
  assert!(result.is_ok());
141
141
 
142
- let result = result.unwrap();
142
+ let result = result.expect("Operation failed");
143
143
  assert!(!result.content.is_empty());
144
144
  }
145
145
 
@@ -148,7 +148,7 @@ async fn test_docbook_section_hierarchy() {
148
148
  let result = extract_docbook4_file("docbook-chapter.docbook").await;
149
149
  assert!(result.is_ok());
150
150
 
151
- let result = result.unwrap();
151
+ let result = result.expect("Operation failed");
152
152
  let content = &result.content;
153
153
 
154
154
  assert!(content.contains("Like a Sect1"));
@@ -162,7 +162,7 @@ async fn test_docbook_paragraph_extraction() {
162
162
  let result = extract_docbook4_file("docbook-chapter.docbook").await;
163
163
  assert!(result.is_ok());
164
164
 
165
- let result = result.unwrap();
165
+ let result = result.expect("Operation failed");
166
166
  assert!(
167
167
  result.content.contains("This chapter uses recursive sections"),
168
168
  "Should extract paragraph content"
@@ -183,7 +183,7 @@ async fn test_docbook_paragraph_content() {
183
183
  let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
184
184
  assert!(result.is_ok());
185
185
 
186
- let result = result.unwrap();
186
+ let result = result.expect("Operation failed");
187
187
  assert!(result.content.contains("Test Article"));
188
188
  assert!(result.content.contains("This is a test paragraph"));
189
189
  assert!(result.content.contains("another paragraph"));
@@ -205,7 +205,7 @@ def hello():
205
205
  let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
206
206
  assert!(result.is_ok());
207
207
 
208
- let result = result.unwrap();
208
+ let result = result.expect("Operation failed");
209
209
  assert!(result.content.contains("def hello"));
210
210
  assert!(result.content.contains("print"));
211
211
  }
@@ -229,7 +229,7 @@ async fn test_docbook_mixed_content() {
229
229
  let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
230
230
  assert!(result.is_ok());
231
231
 
232
- let result = result.unwrap();
232
+ let result = result.expect("Operation failed");
233
233
  assert!(result.content.contains("Test Book"));
234
234
  assert!(result.content.contains("Chapter 1"));
235
235
  assert!(result.content.contains("Section 1.1"));
@@ -259,7 +259,7 @@ async fn test_docbook_namespaced_5x_parsing() {
259
259
  let result = extract_docbook_bytes(docbook5.as_bytes(), "application/docbook+xml").await;
260
260
  assert!(result.is_ok());
261
261
 
262
- let result = result.unwrap();
262
+ let result = result.expect("Operation failed");
263
263
  assert!(result.content.contains("DocBook 5 Article"));
264
264
  assert!(result.content.contains("Welcome to DocBook 5"));
265
265
  }
@@ -277,7 +277,7 @@ async fn test_docbook_link_handling() {
277
277
  let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
278
278
  assert!(result.is_ok());
279
279
 
280
- let result = result.unwrap();
280
+ let result = result.expect("Operation failed");
281
281
  assert!(result.content.contains("example"));
282
282
  }
283
283
 
@@ -316,7 +316,7 @@ async fn test_docbook_empty_sections() {
316
316
  let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
317
317
  assert!(result.is_ok());
318
318
 
319
- let result = result.unwrap();
319
+ let result = result.expect("Operation failed");
320
320
  assert!(result.content.contains("Empty Section"));
321
321
  assert!(result.content.contains("Section with Content"));
322
322
  assert!(result.content.contains("Content here"));
@@ -345,7 +345,7 @@ async fn test_docbook_itemized_list() {
345
345
  let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
346
346
  assert!(result.is_ok());
347
347
 
348
- let result = result.unwrap();
348
+ let result = result.expect("Operation failed");
349
349
  assert!(result.content.contains("First item"));
350
350
  assert!(result.content.contains("Second item"));
351
351
  assert!(result.content.contains("Third item"));
@@ -375,7 +375,7 @@ async fn test_docbook_ordered_list() {
375
375
  let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
376
376
  assert!(result.is_ok());
377
377
 
378
- let result = result.unwrap();
378
+ let result = result.expect("Operation failed");
379
379
  assert!(result.content.contains("First step"));
380
380
  assert!(result.content.contains("Second step"));
381
381
  assert!(result.content.contains("Third step"));
@@ -397,7 +397,7 @@ async fn test_docbook_blockquote() {
397
397
  let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
398
398
  assert!(result.is_ok());
399
399
 
400
- let result = result.unwrap();
400
+ let result = result.expect("Operation failed");
401
401
  assert!(result.content.contains("quoted passage"));
402
402
  assert!(result.content.contains("> "), "Should contain blockquote marker");
403
403
  }
@@ -418,7 +418,7 @@ async fn test_docbook_figure() {
418
418
  let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
419
419
  assert!(result.is_ok());
420
420
 
421
- let result = result.unwrap();
421
+ let result = result.expect("Operation failed");
422
422
  assert!(result.content.contains("Figure"));
423
423
  }
424
424
 
@@ -435,7 +435,7 @@ async fn test_docbook_footnote() {
435
435
  let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
436
436
  assert!(result.is_ok());
437
437
 
438
- let result = result.unwrap();
438
+ let result = result.expect("Operation failed");
439
439
  assert!(result.content.contains("text with a footnote"));
440
440
  assert!(result.content.contains("footnote content"));
441
441
  }
@@ -465,7 +465,7 @@ code example
465
465
  let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
466
466
  assert!(result.is_ok());
467
467
 
468
- let result = result.unwrap();
468
+ let result = result.expect("Operation failed");
469
469
  assert!(result.content.contains("Introduction paragraph"));
470
470
  assert!(result.content.contains("List item 1"));
471
471
  assert!(result.content.contains("List item 2"));
@@ -493,7 +493,7 @@ async fn test_docbook_namespaced_lists() {
493
493
  let result = extract_docbook_bytes(docbook5.as_bytes(), "application/docbook+xml").await;
494
494
  assert!(result.is_ok());
495
495
 
496
- let result = result.unwrap();
496
+ let result = result.expect("Operation failed");
497
497
  assert!(result.content.contains("Namespaced item 1"));
498
498
  assert!(result.content.contains("Namespaced item 2"));
499
499
  assert!(result.content.contains("- "));
@@ -8,9 +8,9 @@ use kreuzberg::{ExtractionConfig, extract_file};
8
8
  async fn test_docx_full_metadata_extraction() {
9
9
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
10
10
  .parent()
11
- .unwrap()
11
+ .expect("Operation failed")
12
12
  .parent()
13
- .unwrap();
13
+ .expect("Operation failed");
14
14
  let test_file = workspace_root.join("test_documents/documents/word_sample.docx");
15
15
 
16
16
  if !test_file.exists() {
@@ -91,9 +91,9 @@ async fn test_docx_full_metadata_extraction() {
91
91
  async fn test_docx_minimal_metadata_extraction() {
92
92
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
93
93
  .parent()
94
- .unwrap()
94
+ .expect("Operation failed")
95
95
  .parent()
96
- .unwrap();
96
+ .expect("Operation failed");
97
97
  let test_file = workspace_root.join("test_documents/documents/lorem_ipsum.docx");
98
98
 
99
99
  if !test_file.exists() {
@@ -143,25 +143,26 @@ async fn test_docx_keywords_extraction() {
143
143
  let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
144
144
 
145
145
  // Add [Content_Types].xml
146
- zip.start_file("[Content_Types].xml", options).unwrap();
146
+ zip.start_file("[Content_Types].xml", options)
147
+ .expect("Operation failed");
147
148
  zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
148
149
  <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
149
150
  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
150
151
  <Default Extension="xml" ContentType="application/xml"/>
151
152
  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
152
153
  <Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>
153
- </Types>"#).unwrap();
154
+ </Types>"#).expect("Operation failed");
154
155
 
155
156
  // Add _rels/.rels
156
- zip.start_file("_rels/.rels", options).unwrap();
157
+ zip.start_file("_rels/.rels", options).expect("Operation failed");
157
158
  zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
158
159
  <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
159
160
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
160
161
  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
161
- </Relationships>"#).unwrap();
162
+ </Relationships>"#).expect("Operation failed");
162
163
 
163
164
  // Add word/document.xml with simple content
164
- zip.start_file("word/document.xml", options).unwrap();
165
+ zip.start_file("word/document.xml", options).expect("Operation failed");
165
166
  zip.write_all(
166
167
  br#"<?xml version="1.0" encoding="UTF-8"?>
167
168
  <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
@@ -174,10 +175,10 @@ async fn test_docx_keywords_extraction() {
174
175
  </w:body>
175
176
  </w:document>"#,
176
177
  )
177
- .unwrap();
178
+ .expect("Operation failed");
178
179
 
179
180
  // Add docProps/core.xml with keywords (comma-separated string)
180
- zip.start_file("docProps/core.xml", options).unwrap();
181
+ zip.start_file("docProps/core.xml", options).expect("Operation failed");
181
182
  zip.write_all(
182
183
  br#"<?xml version="1.0" encoding="UTF-8"?>
183
184
  <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
@@ -189,9 +190,9 @@ async fn test_docx_keywords_extraction() {
189
190
  <dc:subject>Testing keyword extraction</dc:subject>
190
191
  </cp:coreProperties>"#,
191
192
  )
192
- .unwrap();
193
+ .expect("Operation failed");
193
194
 
194
- zip.finish().unwrap();
195
+ zip.finish().expect("Operation failed");
195
196
  }
196
197
 
197
198
  // Extract the DOCX file
@@ -216,7 +217,7 @@ async fn test_docx_keywords_extraction() {
216
217
  "Keywords should be present in metadata.keywords"
217
218
  );
218
219
 
219
- let keywords = result.metadata.keywords.as_ref().unwrap();
220
+ let keywords = result.metadata.keywords.as_ref().expect("Operation failed");
220
221
  assert_eq!(
221
222
  keywords.len(),
222
223
  5,
@@ -10,9 +10,9 @@ use kreuzberg::plugins::DocumentExtractor;
10
10
  async fn test_docx_kreuzberg_vs_pandoc_comparison() {
11
11
  let docx_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
12
12
  .parent()
13
- .unwrap()
13
+ .expect("Operation failed")
14
14
  .parent()
15
- .unwrap()
15
+ .expect("Operation failed")
16
16
  .join("test_documents/documents/word_sample.docx");
17
17
 
18
18
  if !docx_path.exists() {
@@ -319,9 +319,9 @@ Here are some interesting things a respectful duck could eat:
319
319
  async fn test_docx_lorem_ipsum_comparison() {
320
320
  let docx_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
321
321
  .parent()
322
- .unwrap()
322
+ .expect("Operation failed")
323
323
  .parent()
324
- .unwrap()
324
+ .expect("Operation failed")
325
325
  .join("test_documents/documents/lorem_ipsum.docx");
326
326
 
327
327
  if !docx_path.exists() {
@@ -32,7 +32,7 @@ This is the email body content.";
32
32
  assert_eq!(result.metadata.subject, Some("Test Email Subject".to_string()));
33
33
 
34
34
  assert!(result.metadata.format.is_some());
35
- let email_meta = match result.metadata.format.as_ref().unwrap() {
35
+ let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
36
36
  kreuzberg::FormatMetadata::Email(meta) => meta,
37
37
  _ => panic!("Expected Email metadata"),
38
38
  };
@@ -44,7 +44,7 @@ This is the email body content.";
44
44
  assert!(email_meta.bcc_emails.is_empty(), "BCC should be empty");
45
45
 
46
46
  assert!(email_meta.message_id.is_some());
47
- let msg_id = email_meta.message_id.clone().unwrap();
47
+ let msg_id = email_meta.message_id.clone().expect("Operation failed");
48
48
  assert!(
49
49
  msg_id.contains("unique123@example.com"),
50
50
  "Message ID should contain unique123@example.com"
@@ -86,7 +86,7 @@ Attachment content here.\r\n\
86
86
  .expect("Should extract EML with attachment");
87
87
 
88
88
  assert!(result.metadata.format.is_some());
89
- let email_meta = match result.metadata.format.as_ref().unwrap() {
89
+ let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
90
90
  kreuzberg::FormatMetadata::Email(meta) => meta,
91
91
  _ => panic!("Expected Email metadata"),
92
92
  };
@@ -127,7 +127,7 @@ Content-Type: text/html; charset=utf-8\r\n\
127
127
  assert!(result.content.contains("HTML Heading") || result.content.contains("bold"));
128
128
 
129
129
  assert!(result.metadata.format.is_some());
130
- let email_meta = match result.metadata.format.as_ref().unwrap() {
130
+ let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
131
131
  kreuzberg::FormatMetadata::Email(meta) => meta,
132
132
  _ => panic!("Expected Email metadata"),
133
133
  };
@@ -159,7 +159,7 @@ And preserves formatting.";
159
159
  assert!(result.content.contains("preserves formatting"));
160
160
 
161
161
  assert!(result.metadata.format.is_some());
162
- let email_meta = match result.metadata.format.as_ref().unwrap() {
162
+ let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
163
163
  kreuzberg::FormatMetadata::Email(meta) => meta,
164
164
  _ => panic!("Expected Email metadata"),
165
165
  };
@@ -198,7 +198,7 @@ Content-Type: text/html\r\n\
198
198
  );
199
199
 
200
200
  assert!(result.metadata.format.is_some());
201
- let email_meta = match result.metadata.format.as_ref().unwrap() {
201
+ let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
202
202
  kreuzberg::FormatMetadata::Email(meta) => meta,
203
203
  _ => panic!("Expected Email metadata"),
204
204
  };
@@ -290,7 +290,7 @@ Email to multiple recipients.";
290
290
  .expect("Should extract email with multiple recipients");
291
291
 
292
292
  assert!(result.metadata.format.is_some());
293
- let email_meta = match result.metadata.format.as_ref().unwrap() {
293
+ let email_meta = match result.metadata.format.as_ref().expect("Operation failed") {
294
294
  kreuzberg::FormatMetadata::Email(meta) => meta,
295
295
  _ => panic!("Expected Email metadata"),
296
296
  };
@@ -17,9 +17,9 @@ use std::path::PathBuf;
17
17
  fn get_test_epub_path(filename: &str) -> PathBuf {
18
18
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
19
19
  .parent()
20
- .unwrap()
20
+ .expect("Operation failed")
21
21
  .parent()
22
- .unwrap();
22
+ .expect("Operation failed");
23
23
  workspace_root.join(format!("test_documents/epub/{}", filename))
24
24
  }
25
25
 
@@ -187,7 +187,7 @@ async fn test_very_large_file() {
187
187
  let result = extract_bytes(large_bytes, "text/plain", &config).await;
188
188
 
189
189
  assert!(result.is_ok(), "Large file should be processed successfully");
190
- let extraction = result.unwrap();
190
+ let extraction = result.expect("Operation failed");
191
191
 
192
192
  assert!(!extraction.content.is_empty(), "Large file content should not be empty");
193
193
  assert!(extraction.content.len() > 1_000_000, "Content should be large");
@@ -213,12 +213,14 @@ async fn test_unicode_filenames() {
213
213
  let config = ExtractionConfig::default();
214
214
 
215
215
  let mut temp_file = NamedTempFile::new().expect("Should create temp file");
216
- temp_file.write_all(b"Test content with Unicode filename.").unwrap();
216
+ temp_file
217
+ .write_all(b"Test content with Unicode filename.")
218
+ .expect("Operation failed");
217
219
 
218
220
  let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
219
221
 
220
222
  assert!(result.is_ok(), "Unicode filename should be handled");
221
- let extraction = result.unwrap();
223
+ let extraction = result.expect("Operation failed");
222
224
 
223
225
  assert!(
224
226
  extraction.content.contains("Test content"),
@@ -249,7 +251,7 @@ Math symbols: ∑ ∫ √ ≈ ∞";
249
251
  let result = extract_bytes(special_text.as_bytes(), "text/plain", &config).await;
250
252
 
251
253
  assert!(result.is_ok(), "Special characters should be handled");
252
- let extraction = result.unwrap();
254
+ let extraction = result.expect("Operation failed");
253
255
 
254
256
  assert!(!extraction.content.is_empty(), "Content should not be empty");
255
257
  assert!(extraction.content.len() > 10, "Should have substantial content");
@@ -319,17 +321,17 @@ async fn test_permission_denied() {
319
321
  let config = ExtractionConfig::default();
320
322
 
321
323
  let mut temp_file = NamedTempFile::new().expect("Should create temp file");
322
- temp_file.write_all(b"Test content").unwrap();
324
+ temp_file.write_all(b"Test content").expect("Operation failed");
323
325
 
324
- let mut perms = fs::metadata(temp_file.path()).unwrap().permissions();
326
+ let mut perms = fs::metadata(temp_file.path()).expect("Operation failed").permissions();
325
327
  perms.set_mode(0o000);
326
- fs::set_permissions(temp_file.path(), perms).unwrap();
328
+ fs::set_permissions(temp_file.path(), perms).expect("Operation failed");
327
329
 
328
330
  let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
329
331
 
330
- let mut perms = fs::metadata(temp_file.path()).unwrap().permissions();
332
+ let mut perms = fs::metadata(temp_file.path()).expect("Operation failed").permissions();
331
333
  perms.set_mode(0o644);
332
- fs::set_permissions(temp_file.path(), perms).unwrap();
334
+ fs::set_permissions(temp_file.path(), perms).expect("Operation failed");
333
335
 
334
336
  assert!(result.is_err(), "Permission denied should return error");
335
337
  }
@@ -356,7 +358,7 @@ async fn test_null_bytes_in_content() {
356
358
  let result = extract_bytes(data_with_nulls, "text/plain", &config).await;
357
359
 
358
360
  assert!(result.is_ok(), "Null bytes should be handled");
359
- let extraction = result.unwrap();
361
+ let extraction = result.expect("Operation failed");
360
362
 
361
363
  assert!(!extraction.content.is_empty(), "Content should not be empty");
362
364
  assert!(
@@ -388,7 +390,7 @@ async fn test_concurrent_extractions() {
388
390
  let result = handle.await.expect("Task should complete");
389
391
  assert!(result.is_ok(), "Concurrent extraction should succeed");
390
392
 
391
- let extraction = result.unwrap();
393
+ let extraction = result.expect("Operation failed");
392
394
  assert!(
393
395
  extraction.content.contains("Concurrent extraction"),
394
396
  "Content should be extracted correctly"
@@ -9,9 +9,9 @@ fn test_file_path(filename: &str) -> PathBuf {
9
9
  let manifest_dir = env!("CARGO_MANIFEST_DIR");
10
10
  PathBuf::from(manifest_dir)
11
11
  .parent()
12
- .unwrap()
12
+ .expect("Operation failed")
13
13
  .parent()
14
- .unwrap()
14
+ .expect("Operation failed")
15
15
  .join("test_documents")
16
16
  .join("fictionbook")
17
17
  .join(filename)
@@ -37,7 +37,7 @@ mod html_table_tests {
37
37
  let result = convert_html_to_markdown(html, None, None);
38
38
  assert!(result.is_ok(), "HTML to markdown conversion should succeed");
39
39
 
40
- let markdown = result.unwrap();
40
+ let markdown = result.expect("Operation failed");
41
41
 
42
42
  println!("=== Basic Table Test ===");
43
43
  println!("Input HTML:\n{}", html);
@@ -79,7 +79,7 @@ mod html_table_tests {
79
79
  let result = convert_html_to_markdown(html, None, None);
80
80
  assert!(result.is_ok(), "Should convert to markdown");
81
81
 
82
- let markdown = result.unwrap();
82
+ let markdown = result.expect("Operation failed");
83
83
 
84
84
  println!("=== Table Format Test ===");
85
85
  println!("Input HTML:\n{}", html);
@@ -143,7 +143,7 @@ mod html_table_tests {
143
143
  let result = convert_html_to_markdown(html, None, None);
144
144
  assert!(result.is_ok(), "Should convert complex table");
145
145
 
146
- let markdown = result.unwrap();
146
+ let markdown = result.expect("Operation failed");
147
147
 
148
148
  println!("=== Complex Table Test ===");
149
149
  println!("Input HTML:\n{}", html);
@@ -194,7 +194,7 @@ mod html_table_tests {
194
194
  let result = convert_html_to_markdown(html, None, None);
195
195
  assert!(result.is_ok(), "Should handle merged cell table");
196
196
 
197
- let markdown = result.unwrap();
197
+ let markdown = result.expect("Operation failed");
198
198
 
199
199
  println!("=== Merged Cells Test ===");
200
200
  println!("Input HTML:\n{}", html);
@@ -248,7 +248,7 @@ mod html_table_tests {
248
248
  let result = convert_html_to_markdown(html, None, None);
249
249
  assert!(result.is_ok(), "Should handle multiple tables");
250
250
 
251
- let markdown = result.unwrap();
251
+ let markdown = result.expect("Operation failed");
252
252
 
253
253
  println!("=== Multiple Tables Test ===");
254
254
  println!("Input HTML:\n{}", html);
@@ -303,7 +303,7 @@ mod html_table_tests {
303
303
  let result = convert_html_to_markdown(html, None, None);
304
304
  assert!(result.is_ok(), "Should handle mixed header cells");
305
305
 
306
- let markdown = result.unwrap();
306
+ let markdown = result.expect("Operation failed");
307
307
 
308
308
  println!("=== Mixed Header Cells Test ===");
309
309
  println!("Input HTML:\n{}", html);
@@ -349,7 +349,7 @@ mod html_table_tests {
349
349
  let result = convert_html_to_markdown(html, None, None);
350
350
  assert!(result.is_ok(), "Should handle table with caption");
351
351
 
352
- let markdown = result.unwrap();
352
+ let markdown = result.expect("Operation failed");
353
353
 
354
354
  println!("=== Table with Caption Test ===");
355
355
  println!("Input HTML:\n{}", html);
@@ -385,7 +385,7 @@ mod html_table_tests {
385
385
  let result = convert_html_to_markdown(html, None, None);
386
386
  assert!(result.is_ok(), "Should handle flat table");
387
387
 
388
- let markdown = result.unwrap();
388
+ let markdown = result.expect("Operation failed");
389
389
 
390
390
  println!("=== Simple Flat Table Test ===");
391
391
  println!("Input HTML:\n{}", html);
@@ -421,7 +421,7 @@ mod html_table_tests {
421
421
  let result = convert_html_to_markdown(html, None, None);
422
422
  assert!(result.is_ok(), "Should handle empty cells");
423
423
 
424
- let markdown = result.unwrap();
424
+ let markdown = result.expect("Operation failed");
425
425
 
426
426
  println!("=== Empty Cells Test ===");
427
427
  println!("Input HTML:\n{}", html);
@@ -459,7 +459,7 @@ mod html_table_tests {
459
459
  let result = convert_html_to_markdown(html, None, None);
460
460
  assert!(result.is_ok(), "Should handle numeric table");
461
461
 
462
- let markdown = result.unwrap();
462
+ let markdown = result.expect("Operation failed");
463
463
 
464
464
  println!("=== Numeric Data Test ===");
465
465
  println!("Input HTML:\n{}", html);
@@ -502,7 +502,7 @@ mod html_table_tests {
502
502
  let result = convert_html_to_markdown(html, None, None);
503
503
  assert!(result.is_ok(), "Should handle unicode characters");
504
504
 
505
- let markdown = result.unwrap();
505
+ let markdown = result.expect("Operation failed");
506
506
 
507
507
  println!("=== Special Characters Test ===");
508
508
  println!("Input HTML:\n{}", html);
@@ -17,7 +17,10 @@ struct SpanCollector {
17
17
 
18
18
  impl<S: Subscriber + for<'a> LookupSpan<'a>> Layer<S> for SpanCollector {
19
19
  fn on_new_span(&self, attrs: &Attributes<'_>, _id: &Id, _ctx: Context<'_, S>) {
20
- self.spans.lock().unwrap().push(attrs.metadata().name().to_string());
20
+ self.spans
21
+ .lock()
22
+ .expect("Operation failed")
23
+ .push(attrs.metadata().name().to_string());
21
24
  }
22
25
  }
23
26
 
@@ -32,21 +35,23 @@ async fn test_cache_instrumentation() {
32
35
  let subscriber = tracing_subscriber::registry().with(collector);
33
36
  let _guard = tracing::subscriber::set_default(subscriber);
34
37
 
35
- let temp_dir = tempdir().unwrap();
38
+ let temp_dir = tempdir().expect("Operation failed");
36
39
  let cache = GenericCache::new(
37
40
  "test".to_string(),
38
- Some(temp_dir.path().to_str().unwrap().to_string()),
41
+ Some(temp_dir.path().to_str().expect("Operation failed").to_string()),
39
42
  30.0,
40
43
  500.0,
41
44
  1000.0,
42
45
  )
43
- .unwrap();
46
+ .expect("Operation failed");
44
47
 
45
- cache.set("test_key", b"test data".to_vec(), None).unwrap();
48
+ cache
49
+ .set("test_key", b"test data".to_vec(), None)
50
+ .expect("Operation failed");
46
51
 
47
- let _ = cache.get("test_key", None).unwrap();
52
+ let _ = cache.get("test_key", None).expect("Value not found");
48
53
 
49
- let span_names = spans.lock().unwrap();
54
+ let span_names = spans.lock().expect("Operation failed");
50
55
  assert!(span_names.contains(&"set".to_string()), "Expected 'set' span");
51
56
  assert!(span_names.contains(&"get".to_string()), "Expected 'get' span");
52
57
  }
@@ -64,13 +69,13 @@ async fn test_ocr_instrumentation() {
64
69
  let subscriber = tracing_subscriber::registry().with(collector);
65
70
  let _guard = tracing::subscriber::set_default(subscriber);
66
71
 
67
- let temp_dir = tempdir().unwrap();
68
- let processor = OcrProcessor::new(Some(temp_dir.path().to_path_buf())).unwrap();
72
+ let temp_dir = tempdir().expect("Operation failed");
73
+ let processor = OcrProcessor::new(Some(temp_dir.path().to_path_buf())).expect("Operation failed");
69
74
 
70
75
  let mut test_image = Vec::new();
71
76
  let img = image::ImageBuffer::from_fn(1, 1, |_, _| image::Rgb([255u8, 255u8, 255u8]));
72
77
  img.write_to(&mut std::io::Cursor::new(&mut test_image), image::ImageFormat::Png)
73
- .unwrap();
78
+ .expect("Operation failed");
74
79
 
75
80
  let config = TesseractConfig {
76
81
  output_format: "text".to_string(),
@@ -80,7 +85,7 @@ async fn test_ocr_instrumentation() {
80
85
 
81
86
  let _ = processor.process_image(&test_image, &config);
82
87
 
83
- let span_names = spans.lock().unwrap();
88
+ let span_names = spans.lock().expect("Operation failed");
84
89
  assert!(
85
90
  span_names.contains(&"process_image".to_string()),
86
91
  "Expected 'process_image' span"
@@ -101,7 +106,7 @@ async fn test_registry_instrumentation() {
101
106
 
102
107
  let _ = registry.get("application/pdf");
103
108
 
104
- let span_names = spans.lock().unwrap();
109
+ let span_names = spans.lock().expect("Operation failed");
105
110
  assert!(
106
111
  span_names.contains(&"get".to_string()),
107
112
  "Expected 'get' span from registry"
@@ -125,7 +130,7 @@ async fn test_span_hierarchy() {
125
130
 
126
131
  let _ = extract_bytes(test_content, "text/plain", &config).await;
127
132
 
128
- let span_names = spans.lock().unwrap();
133
+ let span_names = spans.lock().expect("Operation failed");
129
134
  assert!(
130
135
  span_names.contains(&"extract_bytes".to_string()),
131
136
  "Expected 'extract_bytes' span"