kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -12,18 +12,6 @@ use std::path::PathBuf;
|
|
|
12
12
|
mod helpers;
|
|
13
13
|
use helpers::{get_test_documents_dir, get_test_file_path, skip_if_missing, test_documents_available};
|
|
14
14
|
|
|
15
|
-
fn trim_trailing_newlines(value: &str) -> &str {
|
|
16
|
-
value.trim_end_matches(['\n', '\r'])
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
fn assert_text_content(actual: &str, expected: &str) {
|
|
20
|
-
assert_eq!(
|
|
21
|
-
trim_trailing_newlines(actual),
|
|
22
|
-
expected,
|
|
23
|
-
"Content mismatch after trimming trailing newlines"
|
|
24
|
-
);
|
|
25
|
-
}
|
|
26
|
-
|
|
27
15
|
/// Test batch extraction with multiple file formats (PDF, DOCX, TXT).
|
|
28
16
|
#[tokio::test]
|
|
29
17
|
async fn test_batch_extract_file_multiple_formats() {
|
|
@@ -134,7 +122,7 @@ async fn test_batch_extract_bytes_multiple() {
|
|
|
134
122
|
|
|
135
123
|
assert_eq!(results.len(), 3);
|
|
136
124
|
|
|
137
|
-
|
|
125
|
+
assert_eq!(results[0].content, "This is plain text content");
|
|
138
126
|
assert_eq!(results[0].mime_type, "text/plain");
|
|
139
127
|
|
|
140
128
|
assert!(results[1].content.contains("Markdown Header"));
|
|
@@ -310,7 +298,7 @@ fn test_batch_extract_bytes_sync_variant() {
|
|
|
310
298
|
let results = results.unwrap();
|
|
311
299
|
|
|
312
300
|
assert_eq!(results.len(), 3);
|
|
313
|
-
|
|
314
|
-
|
|
301
|
+
assert_eq!(results[0].content, "content 1");
|
|
302
|
+
assert_eq!(results[1].content, "content 2");
|
|
315
303
|
assert!(results[2].content.contains("content 3"));
|
|
316
304
|
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
#[cfg(feature = "chunking")]
|
|
2
|
+
#[test]
|
|
3
|
+
fn demonstrate_correct_offset_calculation() {
|
|
4
|
+
use kreuzberg::chunking::{ChunkerType, ChunkingConfig, chunk_text};
|
|
5
|
+
|
|
6
|
+
println!("\n=== Demonstrating Correct Chunking Offset Calculation ===\n");
|
|
7
|
+
|
|
8
|
+
let config_with_overlap = ChunkingConfig {
|
|
9
|
+
max_characters: 20,
|
|
10
|
+
overlap: 5,
|
|
11
|
+
trim: false,
|
|
12
|
+
chunker_type: ChunkerType::Text,
|
|
13
|
+
};
|
|
14
|
+
|
|
15
|
+
let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
|
|
16
|
+
println!("Text: \"{}\"", text);
|
|
17
|
+
println!(
|
|
18
|
+
"Max characters: {}, Overlap: {}\n",
|
|
19
|
+
config_with_overlap.max_characters, config_with_overlap.overlap
|
|
20
|
+
);
|
|
21
|
+
|
|
22
|
+
let result = chunk_text(text, &config_with_overlap).unwrap();
|
|
23
|
+
|
|
24
|
+
println!("WITH OVERLAP (5 chars):");
|
|
25
|
+
for (i, chunk) in result.chunks.iter().enumerate() {
|
|
26
|
+
println!(
|
|
27
|
+
" Chunk {}: [{:3} - {:3}] = \"{}\"",
|
|
28
|
+
i,
|
|
29
|
+
chunk.metadata.char_start,
|
|
30
|
+
chunk.metadata.char_end,
|
|
31
|
+
chunk.content.replace('\n', "\\n")
|
|
32
|
+
);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
println!("\nOverlap verification:");
|
|
36
|
+
for i in 0..result.chunks.len() - 1 {
|
|
37
|
+
let current = &result.chunks[i];
|
|
38
|
+
let next = &result.chunks[i + 1];
|
|
39
|
+
let overlap_size = current.metadata.char_end - next.metadata.char_start;
|
|
40
|
+
println!(
|
|
41
|
+
" Chunks {} and {}: overlap = {} chars (next starts at {} while current ends at {})",
|
|
42
|
+
i,
|
|
43
|
+
i + 1,
|
|
44
|
+
overlap_size,
|
|
45
|
+
next.metadata.char_start,
|
|
46
|
+
current.metadata.char_end
|
|
47
|
+
);
|
|
48
|
+
assert!(
|
|
49
|
+
overlap_size > 0 && overlap_size <= config_with_overlap.overlap + 10,
|
|
50
|
+
"Overlap should exist and be reasonable"
|
|
51
|
+
);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
println!("\n\n=== Without Overlap ===\n");
|
|
55
|
+
let config_no_overlap = ChunkingConfig {
|
|
56
|
+
max_characters: 20,
|
|
57
|
+
overlap: 0,
|
|
58
|
+
trim: false,
|
|
59
|
+
chunker_type: ChunkerType::Text,
|
|
60
|
+
};
|
|
61
|
+
|
|
62
|
+
let result_no_overlap = chunk_text(text, &config_no_overlap).unwrap();
|
|
63
|
+
|
|
64
|
+
println!("WITHOUT OVERLAP:");
|
|
65
|
+
for (i, chunk) in result_no_overlap.chunks.iter().enumerate() {
|
|
66
|
+
println!(
|
|
67
|
+
" Chunk {}: [{:3} - {:3}] = \"{}\"",
|
|
68
|
+
i,
|
|
69
|
+
chunk.metadata.char_start,
|
|
70
|
+
chunk.metadata.char_end,
|
|
71
|
+
chunk.content.replace('\n', "\\n")
|
|
72
|
+
);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
println!("\nAdjacency verification:");
|
|
76
|
+
for i in 0..result_no_overlap.chunks.len() - 1 {
|
|
77
|
+
let current = &result_no_overlap.chunks[i];
|
|
78
|
+
let next = &result_no_overlap.chunks[i + 1];
|
|
79
|
+
let gap = next.metadata.char_start as i32 - current.metadata.char_end as i32;
|
|
80
|
+
println!(
|
|
81
|
+
" Chunks {} and {}: gap = {} (next starts at {}, current ends at {})",
|
|
82
|
+
i,
|
|
83
|
+
i + 1,
|
|
84
|
+
gap,
|
|
85
|
+
next.metadata.char_start,
|
|
86
|
+
current.metadata.char_end
|
|
87
|
+
);
|
|
88
|
+
assert!(gap >= 0, "Should have no overlap (gap >= 0)");
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
println!("\n✓ All offset calculations are correct!");
|
|
92
|
+
}
|
|
@@ -30,18 +30,6 @@ use tokio::time::timeout;
|
|
|
30
30
|
|
|
31
31
|
mod helpers;
|
|
32
32
|
|
|
33
|
-
fn trim_trailing_newlines(value: &str) -> &str {
|
|
34
|
-
value.trim_end_matches(['\n', '\r'])
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
fn assert_text_content(actual: &str, expected: &str) {
|
|
38
|
-
assert_eq!(
|
|
39
|
-
trim_trailing_newlines(actual),
|
|
40
|
-
expected,
|
|
41
|
-
"Content mismatch after trimming trailing newlines"
|
|
42
|
-
);
|
|
43
|
-
}
|
|
44
|
-
|
|
45
33
|
/// Test many concurrent extractions of different MIME types.
|
|
46
34
|
///
|
|
47
35
|
/// Validates that:
|
|
@@ -156,7 +144,7 @@ async fn test_concurrent_extractions_with_cache() {
|
|
|
156
144
|
let result = handle.await.expect("Task should not panic");
|
|
157
145
|
assert!(result.is_ok(), "Cache read should succeed");
|
|
158
146
|
let extraction = result.unwrap();
|
|
159
|
-
|
|
147
|
+
assert_eq!(extraction.content, expected_content);
|
|
160
148
|
}
|
|
161
149
|
}
|
|
162
150
|
|
|
@@ -171,10 +159,6 @@ async fn test_concurrent_extractions_with_cache() {
|
|
|
171
159
|
async fn test_concurrent_ocr_processing() {
|
|
172
160
|
use helpers::{get_test_file_path, skip_if_missing};
|
|
173
161
|
|
|
174
|
-
if cfg!(windows) {
|
|
175
|
-
return;
|
|
176
|
-
}
|
|
177
|
-
|
|
178
162
|
if skip_if_missing("images/ocr_image.jpg") {
|
|
179
163
|
tracing::debug!("Skipping concurrent OCR test: test file not available");
|
|
180
164
|
return;
|
|
@@ -484,15 +484,8 @@ async fn test_quality_processing_disabled() {
|
|
|
484
484
|
}
|
|
485
485
|
|
|
486
486
|
/// Test chunking with embeddings using balanced preset.
|
|
487
|
-
///
|
|
488
|
-
/// This test requires ONNX Runtime to be installed as a system dependency.
|
|
489
|
-
/// On macOS with Homebrew: `brew install onnxruntime`
|
|
490
|
-
/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
|
|
491
|
-
/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
|
|
492
487
|
#[tokio::test]
|
|
493
488
|
#[cfg(feature = "embeddings")]
|
|
494
|
-
#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
|
|
495
|
-
#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
|
|
496
489
|
async fn test_chunking_with_embeddings() {
|
|
497
490
|
use kreuzberg::core::config::EmbeddingConfig;
|
|
498
491
|
|
|
@@ -550,15 +543,8 @@ async fn test_chunking_with_embeddings() {
|
|
|
550
543
|
}
|
|
551
544
|
|
|
552
545
|
/// Test chunking with fast embedding preset.
|
|
553
|
-
///
|
|
554
|
-
/// This test requires ONNX Runtime to be installed as a system dependency.
|
|
555
|
-
/// On macOS with Homebrew: `brew install onnxruntime`
|
|
556
|
-
/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
|
|
557
|
-
/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
|
|
558
546
|
#[tokio::test]
|
|
559
547
|
#[cfg(feature = "embeddings")]
|
|
560
|
-
#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
|
|
561
|
-
#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
|
|
562
548
|
async fn test_chunking_with_fast_embeddings() {
|
|
563
549
|
use kreuzberg::core::config::{EmbeddingConfig, EmbeddingModelType};
|
|
564
550
|
|
|
@@ -587,10 +573,6 @@ async fn test_chunking_with_fast_embeddings() {
|
|
|
587
573
|
let chunks = result.chunks.expect("Should have chunks");
|
|
588
574
|
assert!(!chunks.is_empty(), "Should have at least one chunk");
|
|
589
575
|
|
|
590
|
-
if let Some(error) = result.metadata.additional.get("embedding_error") {
|
|
591
|
-
panic!("Embedding generation failed: {}", error);
|
|
592
|
-
}
|
|
593
|
-
|
|
594
576
|
for chunk in &chunks {
|
|
595
577
|
let embedding = chunk.embedding.as_ref().expect("Should have embedding");
|
|
596
578
|
assert_eq!(embedding.len(), 384, "Fast preset should produce 384-dim embeddings");
|
|
@@ -124,6 +124,7 @@ ocr:
|
|
|
124
124
|
fn test_from_file_nonexistent_path_fails() {
|
|
125
125
|
let result = ExtractionConfig::from_file("/nonexistent/path/config.toml");
|
|
126
126
|
assert!(result.is_err(), "Should fail for nonexistent path: {:?}", result);
|
|
127
|
+
// Error can be Io or other types depending on the implementation
|
|
127
128
|
}
|
|
128
129
|
|
|
129
130
|
/// Test from_file with malformed TOML fails.
|
|
@@ -141,6 +142,7 @@ enabled = true
|
|
|
141
142
|
|
|
142
143
|
let result = ExtractionConfig::from_file(&config_path);
|
|
143
144
|
assert!(result.is_err(), "Should fail for malformed TOML: {:?}", result);
|
|
145
|
+
// Error handling varies - just ensure it failed
|
|
144
146
|
}
|
|
145
147
|
|
|
146
148
|
/// Test from_file with malformed JSON fails.
|
|
@@ -162,6 +164,7 @@ fn test_from_file_malformed_json_fails() {
|
|
|
162
164
|
|
|
163
165
|
let result = ExtractionConfig::from_file(&config_path);
|
|
164
166
|
assert!(result.is_err(), "Should fail for malformed JSON: {:?}", result);
|
|
167
|
+
// Error handling varies - just ensure it failed
|
|
165
168
|
}
|
|
166
169
|
|
|
167
170
|
/// Test from_file with malformed YAML fails.
|
|
@@ -180,6 +183,7 @@ ocr:
|
|
|
180
183
|
|
|
181
184
|
let result = ExtractionConfig::from_file(&config_path);
|
|
182
185
|
assert!(result.is_err(), "Should fail for malformed YAML: {:?}", result);
|
|
186
|
+
// Error handling varies - just ensure it failed
|
|
183
187
|
}
|
|
184
188
|
|
|
185
189
|
/// Test from_file with empty file uses defaults.
|
|
@@ -194,6 +198,7 @@ fn test_from_file_empty_file_uses_defaults() {
|
|
|
194
198
|
assert!(config.is_ok(), "Should load empty file successfully");
|
|
195
199
|
|
|
196
200
|
let config = config.unwrap();
|
|
201
|
+
// Should have default values
|
|
197
202
|
assert!(config.ocr.is_none(), "Default config should have no OCR");
|
|
198
203
|
assert!(config.chunking.is_none(), "Default config should have no chunking");
|
|
199
204
|
}
|
|
@@ -209,18 +214,22 @@ fn test_from_file_unsupported_extension_fails() {
|
|
|
209
214
|
let result = ExtractionConfig::from_file(&config_path);
|
|
210
215
|
assert!(result.is_err(), "Should fail for unsupported extension: {:?}", result);
|
|
211
216
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
217
|
+
match result {
|
|
218
|
+
Err(KreuzbergError::Validation { message, .. }) => {
|
|
219
|
+
assert!(
|
|
220
|
+
message.contains("format") || message.contains("extension") || message.contains("Unsupported"),
|
|
221
|
+
"Error should mention format/extension: {}",
|
|
222
|
+
message
|
|
223
|
+
);
|
|
224
|
+
}
|
|
225
|
+
_ => {
|
|
226
|
+
// Some other error is also acceptable
|
|
227
|
+
}
|
|
218
228
|
}
|
|
219
229
|
}
|
|
220
230
|
|
|
221
231
|
/// Test discover() finds config in current directory.
|
|
222
232
|
#[test]
|
|
223
|
-
#[serial_test::serial]
|
|
224
233
|
fn test_discover_finds_config_in_current_dir() {
|
|
225
234
|
let temp_dir = TempDir::new().unwrap();
|
|
226
235
|
let config_path = temp_dir.path().join("kreuzberg.toml");
|
|
@@ -232,11 +241,13 @@ enabled = true
|
|
|
232
241
|
|
|
233
242
|
fs::write(&config_path, toml_content).unwrap();
|
|
234
243
|
|
|
244
|
+
// Change to temp directory
|
|
235
245
|
let original_dir = std::env::current_dir().unwrap();
|
|
236
246
|
std::env::set_current_dir(temp_dir.path()).unwrap();
|
|
237
247
|
|
|
238
248
|
let result = ExtractionConfig::discover();
|
|
239
249
|
|
|
250
|
+
// Restore original directory
|
|
240
251
|
std::env::set_current_dir(original_dir).unwrap();
|
|
241
252
|
|
|
242
253
|
assert!(result.is_ok(), "Discover should succeed");
|
|
@@ -247,7 +258,6 @@ enabled = true
|
|
|
247
258
|
|
|
248
259
|
/// Test discover() finds config in parent directory.
|
|
249
260
|
#[test]
|
|
250
|
-
#[serial_test::serial]
|
|
251
261
|
fn test_discover_finds_config_in_parent_dir() {
|
|
252
262
|
let temp_dir = TempDir::new().unwrap();
|
|
253
263
|
let config_path = temp_dir.path().join("kreuzberg.toml");
|
|
@@ -259,14 +269,17 @@ enabled = true
|
|
|
259
269
|
|
|
260
270
|
fs::write(&config_path, toml_content).unwrap();
|
|
261
271
|
|
|
272
|
+
// Create subdirectory
|
|
262
273
|
let sub_dir = temp_dir.path().join("subdir");
|
|
263
274
|
fs::create_dir(&sub_dir).unwrap();
|
|
264
275
|
|
|
276
|
+
// Change to subdirectory
|
|
265
277
|
let original_dir = std::env::current_dir().unwrap();
|
|
266
278
|
std::env::set_current_dir(&sub_dir).unwrap();
|
|
267
279
|
|
|
268
280
|
let result = ExtractionConfig::discover();
|
|
269
281
|
|
|
282
|
+
// Restore original directory
|
|
270
283
|
std::env::set_current_dir(original_dir).unwrap();
|
|
271
284
|
|
|
272
285
|
assert!(result.is_ok(), "Discover should succeed");
|
|
@@ -277,39 +290,44 @@ enabled = true
|
|
|
277
290
|
|
|
278
291
|
/// Test discover() returns None when no config found.
|
|
279
292
|
#[test]
|
|
280
|
-
#[serial_test::serial]
|
|
281
293
|
fn test_discover_returns_none_when_not_found() {
|
|
282
294
|
let temp_dir = TempDir::new().unwrap();
|
|
283
295
|
let sub_dir = temp_dir.path().join("subdir");
|
|
284
296
|
fs::create_dir(&sub_dir).unwrap();
|
|
285
297
|
|
|
298
|
+
// Change to subdirectory (no config files)
|
|
286
299
|
let original_dir = std::env::current_dir().unwrap();
|
|
287
300
|
std::env::set_current_dir(&sub_dir).unwrap();
|
|
288
301
|
|
|
289
302
|
let result = ExtractionConfig::discover();
|
|
290
303
|
|
|
304
|
+
// Restore original directory
|
|
291
305
|
std::env::set_current_dir(original_dir).unwrap();
|
|
292
306
|
|
|
293
307
|
assert!(result.is_ok(), "Discover should succeed even when no config found");
|
|
294
308
|
let _config = result.unwrap();
|
|
309
|
+
// May return None or may find a config in parent directories (e.g., repository root)
|
|
310
|
+
// Just verify it doesn't error - the specific behavior depends on the directory structure
|
|
295
311
|
}
|
|
296
312
|
|
|
297
313
|
/// Test discover() prefers certain file names.
|
|
298
314
|
#[test]
|
|
299
|
-
#[serial_test::serial]
|
|
300
315
|
fn test_discover_file_name_preference() {
|
|
301
316
|
let temp_dir = TempDir::new().unwrap();
|
|
302
317
|
|
|
318
|
+
// Create multiple config files
|
|
303
319
|
fs::write(temp_dir.path().join("kreuzberg.toml"), "[ocr]\nenabled = true").unwrap();
|
|
304
320
|
fs::write(temp_dir.path().join(".kreuzberg.toml"), "[ocr]\nenabled = false").unwrap();
|
|
305
321
|
|
|
306
322
|
let original_dir = std::env::current_dir().unwrap();
|
|
307
323
|
if std::env::set_current_dir(temp_dir.path()).is_err() {
|
|
324
|
+
// Skip this test if we can't change directory
|
|
308
325
|
return;
|
|
309
326
|
}
|
|
310
327
|
|
|
311
328
|
let result = ExtractionConfig::discover();
|
|
312
329
|
|
|
330
|
+
// Always restore directory even if test fails
|
|
313
331
|
let _ = std::env::set_current_dir(original_dir);
|
|
314
332
|
|
|
315
333
|
assert!(result.is_ok(), "Discover should succeed");
|
|
@@ -319,7 +337,6 @@ fn test_discover_file_name_preference() {
|
|
|
319
337
|
|
|
320
338
|
/// Test discover() with nested directories.
|
|
321
339
|
#[test]
|
|
322
|
-
#[serial_test::serial]
|
|
323
340
|
fn test_discover_with_nested_directories() {
|
|
324
341
|
let temp_dir = TempDir::new().unwrap();
|
|
325
342
|
let config_path = temp_dir.path().join("kreuzberg.toml");
|
|
@@ -331,18 +348,22 @@ enabled = true
|
|
|
331
348
|
|
|
332
349
|
fs::write(&config_path, toml_content).unwrap();
|
|
333
350
|
|
|
351
|
+
// Create nested subdirectories
|
|
334
352
|
let level1 = temp_dir.path().join("level1");
|
|
335
353
|
let level2 = level1.join("level2");
|
|
336
354
|
let level3 = level2.join("level3");
|
|
337
355
|
fs::create_dir_all(&level3).unwrap();
|
|
338
356
|
|
|
357
|
+
// Change to deepest directory
|
|
339
358
|
let original_dir = std::env::current_dir().unwrap();
|
|
340
359
|
if std::env::set_current_dir(&level3).is_err() {
|
|
360
|
+
// Skip this test if we can't change directory
|
|
341
361
|
return;
|
|
342
362
|
}
|
|
343
363
|
|
|
344
364
|
let result = ExtractionConfig::discover();
|
|
345
365
|
|
|
366
|
+
// Always restore directory even if test fails
|
|
346
367
|
let _ = std::env::set_current_dir(&original_dir);
|
|
347
368
|
|
|
348
369
|
assert!(result.is_ok(), "Discover should succeed");
|
|
@@ -398,6 +419,7 @@ fn test_from_file_with_invalid_values() {
|
|
|
398
419
|
let temp_dir = TempDir::new().unwrap();
|
|
399
420
|
let config_path = temp_dir.path().join("config.toml");
|
|
400
421
|
|
|
422
|
+
// Negative values should be rejected during deserialization or validation
|
|
401
423
|
let toml_content = r#"
|
|
402
424
|
[chunking]
|
|
403
425
|
max_chars = -1000
|
|
@@ -407,9 +429,11 @@ max_overlap = -100
|
|
|
407
429
|
fs::write(&config_path, toml_content).unwrap();
|
|
408
430
|
|
|
409
431
|
let result = ExtractionConfig::from_file(&config_path);
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
432
|
+
// Should either fail parsing or have clamped values
|
|
433
|
+
if let Ok(config) = result {
|
|
434
|
+
// If it succeeds, values should be reasonable
|
|
435
|
+
if let Some(chunking) = config.chunking {
|
|
436
|
+
assert!(chunking.max_chars > 0, "max_chars should be positive");
|
|
437
|
+
}
|
|
414
438
|
}
|
|
415
439
|
}
|
|
@@ -11,18 +11,6 @@ use std::fs::{self, File};
|
|
|
11
11
|
use std::io::Write;
|
|
12
12
|
use tempfile::tempdir;
|
|
13
13
|
|
|
14
|
-
fn trim_trailing_newlines(value: &str) -> &str {
|
|
15
|
-
value.trim_end_matches(['\n', '\r'])
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
fn assert_text_content(actual: &str, expected: &str) {
|
|
19
|
-
assert_eq!(
|
|
20
|
-
trim_trailing_newlines(actual),
|
|
21
|
-
expected,
|
|
22
|
-
"Content mismatch after trimming trailing newlines"
|
|
23
|
-
);
|
|
24
|
-
}
|
|
25
|
-
|
|
26
14
|
/// Test basic file extraction with MIME detection.
|
|
27
15
|
#[tokio::test]
|
|
28
16
|
async fn test_extract_file_basic() {
|
|
@@ -37,7 +25,7 @@ async fn test_extract_file_basic() {
|
|
|
37
25
|
assert!(result.is_ok(), "Basic file extraction should succeed");
|
|
38
26
|
let result = result.unwrap();
|
|
39
27
|
|
|
40
|
-
|
|
28
|
+
assert_eq!(result.content, "Hello, Kreuzberg!");
|
|
41
29
|
assert_eq!(result.mime_type, "text/plain");
|
|
42
30
|
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
43
31
|
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
@@ -193,12 +181,7 @@ async fn test_batch_extract_bytes_concurrency() {
|
|
|
193
181
|
|
|
194
182
|
for (i, result) in results.iter().enumerate() {
|
|
195
183
|
let expected_content = format!("content {}", i + 1);
|
|
196
|
-
assert_eq!(
|
|
197
|
-
trim_trailing_newlines(&result.content),
|
|
198
|
-
expected_content,
|
|
199
|
-
"Content mismatch for item {}",
|
|
200
|
-
i
|
|
201
|
-
);
|
|
184
|
+
assert_eq!(result.content, expected_content, "Content mismatch for item {}", i);
|
|
202
185
|
assert_eq!(result.mime_type, "text/plain", "MIME type should be text/plain");
|
|
203
186
|
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
204
187
|
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
@@ -218,13 +201,13 @@ fn test_sync_wrappers() {
|
|
|
218
201
|
let result = extract_file_sync(&file_path, None, &config);
|
|
219
202
|
assert!(result.is_ok(), "Sync file extraction should succeed");
|
|
220
203
|
let extraction = result.unwrap();
|
|
221
|
-
|
|
204
|
+
assert_eq!(extraction.content, "sync content");
|
|
222
205
|
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
223
206
|
|
|
224
207
|
let result = extract_bytes_sync(b"test bytes", "text/plain", &config);
|
|
225
208
|
assert!(result.is_ok(), "Sync bytes extraction should succeed");
|
|
226
209
|
let extraction = result.unwrap();
|
|
227
|
-
|
|
210
|
+
assert_eq!(extraction.content, "test bytes");
|
|
228
211
|
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
229
212
|
|
|
230
213
|
let paths = vec![file_path];
|
|
@@ -232,7 +215,7 @@ fn test_sync_wrappers() {
|
|
|
232
215
|
assert!(results.is_ok(), "Batch sync file should succeed");
|
|
233
216
|
let results = results.unwrap();
|
|
234
217
|
assert_eq!(results.len(), 1);
|
|
235
|
-
|
|
218
|
+
assert_eq!(results[0].content, "sync content");
|
|
236
219
|
assert!(results[0].chunks.is_none(), "Chunks should be None");
|
|
237
220
|
|
|
238
221
|
let contents = vec![(b"test".as_slice(), "text/plain")];
|
|
@@ -240,7 +223,7 @@ fn test_sync_wrappers() {
|
|
|
240
223
|
assert!(results.is_ok(), "Batch bytes sync should succeed");
|
|
241
224
|
let results = results.unwrap();
|
|
242
225
|
assert_eq!(results.len(), 1);
|
|
243
|
-
|
|
226
|
+
assert_eq!(results[0].content, "test");
|
|
244
227
|
assert!(results[0].chunks.is_none(), "Chunks should be None");
|
|
245
228
|
}
|
|
246
229
|
|
|
@@ -432,7 +415,7 @@ async fn test_pipeline_execution() {
|
|
|
432
415
|
assert!(result.is_ok(), "Pipeline execution should succeed");
|
|
433
416
|
|
|
434
417
|
let result = result.unwrap();
|
|
435
|
-
|
|
418
|
+
assert_eq!(result.content, "pipeline content");
|
|
436
419
|
assert_eq!(result.mime_type, "text/plain");
|
|
437
420
|
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
438
421
|
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|