kreuzberg 4.1.2 → 4.2.0
This diff compares the contents of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +10 -2
data/vendor/kreuzberg/tests/concurrency_stress.rs

@@ -120,7 +120,7 @@ async fn test_concurrent_batch_extractions() {
     for handle in handles {
         let results = handle.await.expect("Task should not panic");
         assert!(results.is_ok(), "Batch extraction should succeed");
-        let results = results.
+        let results = results.expect("Operation failed");
         assert_eq!(results.len(), 20, "Should return all results");
     }
 }
@@ -147,7 +147,9 @@ async fn test_concurrent_extractions_with_cache() {
 
     let test_data = b"Cached content for concurrent access test";
 
-    let _ = extract_bytes(test_data, "text/plain", &config)
+    let _ = extract_bytes(test_data, "text/plain", &config)
+        .await
+        .expect("Async operation failed");
 
     let mut handles = vec![];
     for _ in 0..100 {
@@ -163,7 +165,7 @@ async fn test_concurrent_extractions_with_cache() {
     for handle in handles {
         let result = handle.await.expect("Task should not panic");
         assert!(result.is_ok(), "Cache read should succeed");
-        let extraction = result.
+        let extraction = result.expect("Operation failed");
         assert_text_content(&extraction.content, expected_content);
     }
 }
@@ -225,7 +227,7 @@ async fn test_concurrent_ocr_processing() {
     let mut extracted_texts = vec![];
     for result in results {
         assert!(result.is_ok(), "OCR should succeed: {:?}", result.err());
-        let extraction = result.
+        let extraction = result.expect("Operation failed");
         assert!(!extraction.content.is_empty(), "OCR should extract text");
         extracted_texts.push(extraction.content);
     }
@@ -394,7 +396,7 @@ async fn test_concurrent_pipeline_processing() {
     for handle in handles {
         let result = handle.await.expect("Task should not panic");
         assert!(result.is_ok(), "Pipeline should succeed");
-        let processed = result.
+        let processed = result.expect("Operation failed");
         assert!(processed.content.contains("[processed]"), "Processor should run");
     }
 
@@ -457,7 +459,9 @@ async fn test_extraction_throughput_scales() {
 
     let sequential_start = std::time::Instant::now();
    for _ in 0..20 {
-        let _ = extract_bytes(test_data, "text/plain", &config)
+        let _ = extract_bytes(test_data, "text/plain", &config)
+            .await
+            .expect("Async operation failed");
     }
     let sequential_duration = sequential_start.elapsed();
 
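Only fragments of the surrounding tests are visible in the hunks above. For orientation, here is a minimal sketch of the spawn-and-join pattern they modify, assuming the `extract_bytes` call shape shown in the fragments and that `ExtractionConfig` implements `Clone` and `Default` (the diff confirms neither; names here are illustrative):

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_bytes;

#[tokio::test]
async fn sketch_concurrent_extraction() {
    let config = ExtractionConfig::default();
    // 'static byte literal, so it can move into spawned tasks.
    let test_data: &'static [u8] = b"Cached content for concurrent access test";

    let mut handles = vec![];
    for _ in 0..100 {
        let config = config.clone();
        // Each task runs one extraction; results come back through the JoinHandles.
        handles.push(tokio::spawn(async move {
            extract_bytes(test_data, "text/plain", &config).await
        }));
    }

    for handle in handles {
        // Unwrap the task join first, then the extraction Result, matching the
        // explicit expect-based style introduced by the hunks above.
        let result = handle.await.expect("Task should not panic");
        let extraction = result.expect("Operation failed");
        assert!(!extraction.content.is_empty());
    }
}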
data/vendor/kreuzberg/tests/config_behavioral.rs (new file)

@@ -0,0 +1,414 @@
+//! Config behavioral verification tests
+//!
+//! These tests verify that configuration options actually affect extraction behavior,
+//! not just that they serialize correctly.
+//!
+//! Unlike serialization tests that only check if configs deserialize, these tests verify
+//! that the configuration options actually influence the extraction process and produce
+//! observable differences in the output.
+
+use kreuzberg::core::config::ChunkingConfig;
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::config::OutputFormat;
+use kreuzberg::core::extractor::extract_bytes;
+use kreuzberg::types::OutputFormat as ResultFormat;
+
+mod helpers;
+
+/// Test output_format Plain produces text without formatting
+///
+/// Note: HTML extractors often convert to markdown internally, so this test
+/// uses plain text input to verify the output_format configuration is respected.
+#[tokio::test]
+async fn test_output_format_plain_produces_plain() {
+    let plain_text = b"Title\n\nParagraph with bold text.";
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Plain,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(plain_text, "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Plain text should not have markdown or HTML formatting
+    assert!(
+        !result.content.contains("# ") && !result.content.contains("<h1>"),
+        "Plain format should not contain markdown headers or HTML tags, got: {}",
+        result.content
+    );
+    assert!(
+        result.content.contains("Title") || result.content.contains("Paragraph"),
+        "Should still contain extracted text content"
+    );
+}
+
+/// Test output_format Markdown produces markdown formatting
+#[tokio::test]
+async fn test_output_format_markdown_produces_markdown() {
+    let html = b"<h1>Title</h1><p>Paragraph with <strong>bold</strong> text.</p>";
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(html, "text/html", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Verify markdown formatting is present (# for headers or ** for bold)
+    let has_markdown = result.content.contains("# ") || result.content.contains("**") || result.content.contains("*");
+
+    assert!(
+        has_markdown,
+        "Markdown format should contain # headers or ** bold, got: {}",
+        result.content
+    );
+}
+
+/// Test output_format HTML produces valid HTML content
+#[tokio::test]
+async fn test_output_format_html_produces_html() {
+    let text = "Title\n\nParagraph with bold text.";
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Html,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // HTML format should be safe and not contain injection vectors
+    assert!(
+        !result.content.contains("<script>"),
+        "HTML format should be safe from injection"
+    );
+    assert!(!result.content.is_empty(), "Should produce content in HTML format");
+}
+
+/// Test result_format Unified produces content in single field
+#[tokio::test]
+async fn test_result_format_unified_structure() {
+    let text = "Sample content";
+
+    let config = ExtractionConfig {
+        result_format: ResultFormat::Unified,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Unified format should have content in main content field
+    assert!(!result.content.is_empty(), "Unified format should have content");
+
+    // Elements should be None or empty for unified format
+    assert!(
+        result.elements.is_none() || result.elements.as_ref().unwrap().is_empty(),
+        "Unified format should not have elements"
+    );
+}
+
+/// Test result_format ElementBased produces element structure
+#[tokio::test]
+async fn test_result_format_element_based_structure() {
+    let text = "First paragraph here.\n\nSecond paragraph with more content.";
+
+    let config = ExtractionConfig {
+        result_format: ResultFormat::ElementBased,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Element-based format should produce elements array
+    if let Some(elements) = &result.elements {
+        assert!(!elements.is_empty(), "Element-based format should have elements");
+        // Verify elements have expected structure
+        for element in elements {
+            assert!(!element.text.is_empty(), "Elements should have non-empty text");
+        }
+    }
+}
+
+/// Test chunking max_chars actually limits chunk size
+#[tokio::test]
+#[cfg(feature = "chunking")]
+async fn test_chunking_max_chars_limits_chunk_size() {
+    let long_text = "word ".repeat(500); // ~2500 characters
+
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_chars: 100,
+            max_overlap: 20,
+            embedding: None,
+            preset: None,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_bytes(long_text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    assert!(result.chunks.is_some(), "Chunking should produce chunks");
+
+    if let Some(chunks) = result.chunks {
+        assert!(chunks.len() > 1, "Long text should produce multiple chunks");
+
+        // Verify chunk size constraint: each chunk should respect max_chars
+        for (i, chunk) in chunks.iter().enumerate() {
+            assert!(
+                chunk.content.len() <= 100 + 20,
+                "Chunk {} exceeds max_chars + overlap: length = {}",
+                i,
+                chunk.content.len()
+            );
+        }
+    }
+}
+
+/// Test chunking with overlap creates overlapping chunks
+#[tokio::test]
+#[cfg(feature = "chunking")]
+async fn test_chunking_overlap_creates_overlap() {
+    let text = "First sentence. ".repeat(30); // ~480 characters
+
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_chars: 50,
+            max_overlap: 15,
+            embedding: None,
+            preset: None,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    if let Some(chunks) = result.chunks {
+        if chunks.len() >= 2 {
+            // Check if adjacent chunks have overlapping text
+            let chunk1_end = &chunks[0].content[chunks[0].content.len().saturating_sub(15)..];
+            let chunk2_start = &chunks[1].content[..chunks[1].content.len().min(15)];
+
+            // There should be some overlap in the text
+            let overlap_found = chunk1_end.chars().any(|c| c != ' ') && chunk2_start.chars().any(|c| c != ' ');
+
+            assert!(
+                overlap_found,
+                "Adjacent chunks should have overlapping non-whitespace text"
+            );
+        }
+    }
+}
+
+/// Test chunking disabled produces no chunks
+#[tokio::test]
+async fn test_chunking_disabled_produces_no_chunks() {
+    let long_text = "word ".repeat(500);
+
+    let config = ExtractionConfig {
+        chunking: None,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(long_text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    assert!(result.chunks.is_none(), "Chunking disabled should produce no chunks");
+}
+
+/// Test use_cache true allows results to be cached
+#[tokio::test]
+async fn test_cache_enabled_allows_caching() {
+    let text = "Test content for caching";
+
+    let config = ExtractionConfig {
+        use_cache: true,
+        ..Default::default()
+    };
+
+    // Extract twice with same content
+    let result1 = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("First extraction should succeed");
+
+    let result2 = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Second extraction should succeed");
+
+    // Results should be identical
+    assert_eq!(
+        result1.content, result2.content,
+        "Cache enabled should produce consistent results"
+    );
+}
+
+/// Test use_cache false disables caching without crashing
+#[tokio::test]
+async fn test_cache_disabled_does_not_crash() {
+    let text = "Test content without caching";
+
+    let config = ExtractionConfig {
+        use_cache: false,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Extraction with cache disabled should succeed");
+
+    assert!(!result.content.is_empty(), "Should still extract content");
+}
+
+/// Test quality_processing enabled produces quality score
+#[tokio::test]
+#[cfg(feature = "quality")]
+async fn test_quality_processing_enabled_produces_score() {
+    let text = "This is a well-structured document. It has proper sentences. And good formatting.";
+
+    let config = ExtractionConfig {
+        enable_quality_processing: true,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Quality processing should add a quality_score to metadata
+    let has_quality_score = result.metadata.additional.contains_key("quality_score");
+    assert!(
+        has_quality_score,
+        "Quality processing enabled should produce quality_score in metadata"
+    );
+}
+
+/// Test quality_processing disabled does not produce score
+#[tokio::test]
+#[cfg(feature = "quality")]
+async fn test_quality_processing_disabled_no_score() {
+    let text = "This is a document.";
+
+    let config = ExtractionConfig {
+        enable_quality_processing: false,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    assert!(
+        !result.metadata.additional.contains_key("quality_score"),
+        "Quality processing disabled should not produce quality_score"
+    );
+}
+
+/// Test output_format combinations with result_format
+#[tokio::test]
+async fn test_output_format_with_element_based() {
+    let html = b"<p>First paragraph</p><p>Second paragraph</p>";
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        result_format: ResultFormat::ElementBased,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(html, "text/html", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Should have elements
+    assert!(result.elements.is_some(), "ElementBased format should produce elements");
+
+    // Content should still be markdown formatted
+    assert!(
+        !result.content.contains("<p>"),
+        "Output format should not contain HTML tags"
+    );
+}
+
+/// Test chunking respects overlap maximum
+#[tokio::test]
+#[cfg(feature = "chunking")]
+async fn test_chunking_overlap_maximum() {
+    let text = "x".repeat(200); // Simple repeated character
+
+    let config = ExtractionConfig {
+        chunking: Some(ChunkingConfig {
+            max_chars: 60,
+            max_overlap: 10,
+            embedding: None,
+            preset: None,
+        }),
+        ..Default::default()
+    };
+
+    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    if let Some(chunks) = result.chunks {
+        // Verify max_overlap is not exceeded
+        for (i, chunk) in chunks.iter().enumerate() {
+            assert!(
+                chunk.content.len() <= 60 + 10,
+                "Chunk {} size {} exceeds max_chars (60) + max_overlap (10)",
+                i,
+                chunk.content.len()
+            );
+        }
+    }
+}
+
+/// Test large document extraction with multiple config options
+#[tokio::test]
+#[cfg(feature = "chunking")]
+async fn test_large_document_with_combined_config() {
+    let large_text = "This is a paragraph. ".repeat(100); // ~2000 characters
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Plain,
+        chunking: Some(ChunkingConfig {
+            max_chars: 200,
+            max_overlap: 30,
+            embedding: None,
+            preset: None,
+        }),
+        use_cache: true,
+        enable_quality_processing: true,
+        ..Default::default()
+    };
+
+    let result = extract_bytes(large_text.as_bytes(), "text/plain", &config)
+        .await
+        .expect("Should extract successfully");
+
+    // Should have chunks due to size
+    assert!(result.chunks.is_some(), "Should produce chunks for large text");
+
+    // Should have quality score
+    #[cfg(feature = "quality")]
+    {
+        assert!(
+            result.metadata.additional.contains_key("quality_score"),
+            "Should have quality score"
+        );
+    }
+
+    // Should have content in plain format
+    assert!(!result.content.is_empty(), "Should have content");
+}
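The new behavioral suite above doubles as usage documentation for the configuration surface. As a condensed sketch of driving that same surface from application code, reusing only the items the tests import (ExtractionConfig, ChunkingConfig, OutputFormat, extract_bytes) and assuming the "chunking" cargo feature the tests gate on; the main wrapper is illustrative, not part of the diff:

use kreuzberg::core::config::{ChunkingConfig, ExtractionConfig, OutputFormat};
use kreuzberg::core::extractor::extract_bytes;

#[tokio::main]
async fn main() {
    // Combine the options the behavioral tests exercise individually.
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        chunking: Some(ChunkingConfig {
            max_chars: 200,
            max_overlap: 30,
            embedding: None,
            preset: None,
        }),
        use_cache: true,
        ..Default::default()
    };

    // Same call shape as the tests: bytes, MIME type, config reference.
    let result = extract_bytes(b"<h1>Title</h1><p>Body</p>", "text/html", &config)
        .await
        .expect("extraction failed");

    println!("content:\n{}", result.content);
    if let Some(chunks) = result.chunks {
        println!("chunks: {}", chunks.len());
    }
}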
data/vendor/kreuzberg/tests/config_features.rs

@@ -35,14 +35,14 @@ async fn test_chunking_enabled() {
         .expect("Should extract successfully");
 
     assert!(result.chunks.is_some(), "Chunks should be present");
-    let chunks = result.chunks.
+    let chunks = result.chunks.expect("Operation failed");
     assert!(chunks.len() > 1, "Should have multiple chunks");
 
     assert!(result.metadata.additional.contains_key("chunk_count"));
-    let chunk_count = result.metadata.additional.get("chunk_count").
+    let chunk_count = result.metadata.additional.get("chunk_count").expect("Value not found");
     assert_eq!(
         chunks.len(),
-        chunk_count.as_u64().
+        chunk_count.as_u64().expect("Operation failed") as usize,
         "Chunks length should match chunk_count metadata"
     );
 
@@ -78,7 +78,7 @@ async fn test_chunking_with_overlap() {
         .expect("Should extract successfully");
 
     assert!(result.chunks.is_some(), "Chunks should be present");
-    let chunks = result.chunks.
+    let chunks = result.chunks.expect("Operation failed");
     assert!(chunks.len() >= 2, "Should have at least 2 chunks");
 
     assert!(result.metadata.additional.contains_key("chunk_count"));
@@ -118,7 +118,7 @@ async fn test_chunking_custom_sizes() {
         .expect("Should extract successfully");
 
     assert!(result.chunks.is_some(), "Chunks should be present");
-    let chunks = result.chunks.
+    let chunks = result.chunks.expect("Operation failed");
     assert!(!chunks.is_empty(), "Should have at least 1 chunk");
 
     assert!(result.metadata.additional.contains_key("chunk_count"));
@@ -178,7 +178,7 @@ async fn test_language_detection_single() {
         .expect("Should extract successfully");
 
     assert!(result.detected_languages.is_some(), "Should detect language");
-    let languages = result.detected_languages.
+    let languages = result.detected_languages.expect("Operation failed");
     assert!(!languages.is_empty(), "Should detect at least one language");
     assert_eq!(languages[0], "eng", "Should detect English");
 }
@@ -205,7 +205,7 @@ async fn test_language_detection_multiple() {
         .expect("Should extract successfully");
 
     assert!(result.detected_languages.is_some(), "Should detect languages");
-    let languages = result.detected_languages.
+    let languages = result.detected_languages.expect("Operation failed");
     assert!(!languages.is_empty(), "Should detect at least one language");
 }
 
@@ -424,7 +424,7 @@ async fn test_quality_processing_enabled() {
         .expect("Should extract successfully");
 
     if let Some(score) = result.metadata.additional.get("quality_score") {
-        let score_value = score.as_f64().
+        let score_value = score.as_f64().expect("Operation failed");
         assert!((0.0..=1.0).contains(&score_value));
     }
 
@@ -463,16 +463,16 @@ async fn test_quality_threshold_filtering() {
         .metadata
         .additional
         .get("quality_score")
-        .
+        .expect("Operation failed")
         .as_f64()
-        .
+        .expect("Operation failed");
     let score_low = result_low
         .metadata
         .additional
         .get("quality_score")
-        .
+        .expect("Operation failed")
         .as_f64()
-        .
+        .expect("Operation failed");
 
     assert!((0.0..=1.0).contains(&score_high));
     assert!((0.0..=1.0).contains(&score_low));
@@ -528,7 +528,7 @@ async fn test_chunking_with_embeddings() {
         .expect("Should extract successfully");
 
     assert!(result.chunks.is_some(), "Chunks should be present");
-    let chunks = result.chunks.
+    let chunks = result.chunks.expect("Operation failed");
     assert!(chunks.len() > 1, "Should have multiple chunks");
 
     println!("Metadata: {:?}", result.metadata.additional);
@@ -542,13 +542,17 @@ async fn test_chunking_with_embeddings() {
         "Should have embeddings_generated metadata"
     );
     assert_eq!(
-        result
+        result
+            .metadata
+            .additional
+            .get("embeddings_generated")
+            .expect("Value not found"),
         &serde_json::Value::Bool(true)
     );
 
     for chunk in &chunks {
         assert!(chunk.embedding.is_some(), "Each chunk should have an embedding");
-        let embedding = chunk.embedding.as_ref().
+        let embedding = chunk.embedding.as_ref().expect("Operation failed");
         assert_eq!(
             embedding.len(),
             768,