kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
//! allowing us to reuse the existing table reconstruction logic.
|
|
5
5
|
|
|
6
6
|
use super::error::{PdfError, Result};
|
|
7
|
-
use
|
|
7
|
+
use crate::ocr::table::HocrWord;
|
|
8
8
|
use pdfium_render::prelude::*;
|
|
9
9
|
|
|
10
10
|
/// Spacing threshold for word boundary detection (in PDF units).
|
|
@@ -35,24 +35,25 @@ const MIN_WORD_LENGTH: usize = 1;
|
|
|
35
35
|
/// use kreuzberg::pdf::table::extract_words_from_page;
|
|
36
36
|
/// use pdfium_render::prelude::*;
|
|
37
37
|
///
|
|
38
|
-
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
39
38
|
/// let pdfium = Pdfium::default();
|
|
40
39
|
/// let document = pdfium.load_pdf_from_file("example.pdf", None)?;
|
|
41
40
|
/// let page = document.pages().get(0)?;
|
|
42
41
|
/// let words = extract_words_from_page(&page, 90.0)?;
|
|
43
|
-
/// # Ok(())
|
|
44
|
-
/// # }
|
|
45
42
|
/// ```
|
|
46
43
|
pub fn extract_words_from_page(page: &PdfPage, min_confidence: f64) -> Result<Vec<HocrWord>> {
|
|
44
|
+
// Get page dimensions for coordinate system
|
|
47
45
|
let page_width = page.width().value as i32;
|
|
48
46
|
let page_height = page.height().value as i32;
|
|
49
47
|
|
|
48
|
+
// Get all text from page
|
|
50
49
|
let page_text = page
|
|
51
50
|
.text()
|
|
52
51
|
.map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get page text: {}", e)))?;
|
|
53
52
|
|
|
53
|
+
// Extract character-level information
|
|
54
54
|
let chars = page_text.chars();
|
|
55
55
|
|
|
56
|
+
// Group characters into words based on spacing
|
|
56
57
|
let words = group_chars_into_words(chars, page_width, page_height, min_confidence)?;
|
|
57
58
|
|
|
58
59
|
Ok(words)
|
|
@@ -90,22 +91,26 @@ fn group_chars_into_words(
|
|
|
90
91
|
let mut current_word_chars: Vec<CharInfo> = Vec::new();
|
|
91
92
|
|
|
92
93
|
for pdf_char in chars.iter() {
|
|
94
|
+
// Get character bounds (use loose_bounds for table detection)
|
|
93
95
|
let bounds = pdf_char
|
|
94
96
|
.loose_bounds()
|
|
95
97
|
.map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get char bounds: {}", e)))?;
|
|
96
98
|
|
|
99
|
+
// Get unicode character (skip if invalid)
|
|
97
100
|
let Some(ch) = pdf_char.unicode_char() else {
|
|
98
101
|
continue;
|
|
99
102
|
};
|
|
100
103
|
|
|
104
|
+
// Extract character information
|
|
101
105
|
let char_info = CharInfo {
|
|
102
106
|
text: ch,
|
|
103
107
|
x: bounds.left().value,
|
|
104
|
-
y: bounds.bottom().value,
|
|
108
|
+
y: bounds.bottom().value, // PDF coordinates: bottom-left origin
|
|
105
109
|
width: bounds.width().value,
|
|
106
110
|
height: bounds.height().value,
|
|
107
111
|
};
|
|
108
112
|
|
|
113
|
+
// Skip whitespace characters (they're used for word boundaries)
|
|
109
114
|
if char_info.text.is_whitespace() {
|
|
110
115
|
if !current_word_chars.is_empty() {
|
|
111
116
|
if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
|
|
@@ -116,6 +121,7 @@ fn group_chars_into_words(
|
|
|
116
121
|
continue;
|
|
117
122
|
}
|
|
118
123
|
|
|
124
|
+
// Check if this character should start a new word
|
|
119
125
|
if should_start_new_word(&current_word_chars, &char_info) && !current_word_chars.is_empty() {
|
|
120
126
|
if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
|
|
121
127
|
words.push(word);
|
|
@@ -126,10 +132,11 @@ fn group_chars_into_words(
|
|
|
126
132
|
current_word_chars.push(char_info);
|
|
127
133
|
}
|
|
128
134
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
135
|
+
// Finalize last word
|
|
136
|
+
if !current_word_chars.is_empty() {
|
|
137
|
+
if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
|
|
138
|
+
words.push(word);
|
|
139
|
+
}
|
|
133
140
|
}
|
|
134
141
|
|
|
135
142
|
Ok(words)
|
|
@@ -146,11 +153,13 @@ fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -
|
|
|
146
153
|
|
|
147
154
|
let last_char = &current_word_chars[current_word_chars.len() - 1];
|
|
148
155
|
|
|
156
|
+
// Check vertical distance (different lines)
|
|
149
157
|
let vertical_distance = (new_char.y - last_char.y).abs();
|
|
150
158
|
if vertical_distance > last_char.height * 0.5 {
|
|
151
159
|
return true;
|
|
152
160
|
}
|
|
153
161
|
|
|
162
|
+
// Check horizontal distance (word spacing)
|
|
154
163
|
let horizontal_gap = new_char.x - (last_char.x + last_char.width);
|
|
155
164
|
horizontal_gap > WORD_SPACING_THRESHOLD
|
|
156
165
|
}
|
|
@@ -164,12 +173,14 @@ fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> O
|
|
|
164
173
|
return None;
|
|
165
174
|
}
|
|
166
175
|
|
|
176
|
+
// Build word text
|
|
167
177
|
let text: String = chars.iter().map(|c| c.text).collect();
|
|
168
178
|
|
|
169
179
|
if text.len() < MIN_WORD_LENGTH {
|
|
170
180
|
return None;
|
|
171
181
|
}
|
|
172
182
|
|
|
183
|
+
// Calculate bounding box (encompassing all characters)
|
|
173
184
|
let left = chars
|
|
174
185
|
.iter()
|
|
175
186
|
.map(|c| c.x)
|
|
@@ -194,10 +205,14 @@ fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> O
|
|
|
194
205
|
let width = (right - left).round() as i32;
|
|
195
206
|
let height = (top - bottom).round() as i32;
|
|
196
207
|
|
|
208
|
+
// Convert PDF coordinates (bottom-left origin) to image coordinates (top-left origin)
|
|
209
|
+
// HocrWord expects top-left origin like images/OCR output
|
|
197
210
|
let top_in_image_coords = (page_height as f32 - top).round() as i32;
|
|
198
211
|
|
|
212
|
+
// PDF text has high confidence (no OCR uncertainty)
|
|
199
213
|
let confidence = 95.0;
|
|
200
214
|
|
|
215
|
+
// Apply confidence threshold
|
|
201
216
|
if confidence < min_confidence {
|
|
202
217
|
return None;
|
|
203
218
|
}
|
|
@@ -255,18 +270,20 @@ mod tests {
|
|
|
255
270
|
height: 12.0,
|
|
256
271
|
}];
|
|
257
272
|
|
|
273
|
+
// Close character - same word
|
|
258
274
|
let close_char = CharInfo {
|
|
259
275
|
text: 'B',
|
|
260
|
-
x: 111.0,
|
|
276
|
+
x: 111.0, // 1 unit gap
|
|
261
277
|
y: 50.0,
|
|
262
278
|
width: 10.0,
|
|
263
279
|
height: 12.0,
|
|
264
280
|
};
|
|
265
281
|
assert!(!should_start_new_word(&chars, &close_char));
|
|
266
282
|
|
|
283
|
+
// Far character - new word
|
|
267
284
|
let far_char = CharInfo {
|
|
268
285
|
text: 'C',
|
|
269
|
-
x: 120.0,
|
|
286
|
+
x: 120.0, // 10 unit gap (> WORD_SPACING_THRESHOLD)
|
|
270
287
|
y: 50.0,
|
|
271
288
|
width: 10.0,
|
|
272
289
|
height: 12.0,
|
|
@@ -284,10 +301,11 @@ mod tests {
|
|
|
284
301
|
height: 12.0,
|
|
285
302
|
}];
|
|
286
303
|
|
|
304
|
+
// Character on different line
|
|
287
305
|
let new_line_char = CharInfo {
|
|
288
306
|
text: 'B',
|
|
289
307
|
x: 100.0,
|
|
290
|
-
y: 70.0,
|
|
308
|
+
y: 70.0, // Different y
|
|
291
309
|
width: 10.0,
|
|
292
310
|
height: 12.0,
|
|
293
311
|
};
|
|
@@ -318,7 +336,7 @@ mod tests {
|
|
|
318
336
|
|
|
319
337
|
assert_eq!(word.text, "Hi");
|
|
320
338
|
assert_eq!(word.left, 100);
|
|
321
|
-
assert_eq!(word.width, 18);
|
|
339
|
+
assert_eq!(word.width, 18); // 110 + 8 - 100
|
|
322
340
|
assert_eq!(word.height, 12);
|
|
323
341
|
assert_eq!(word.confidence, 95.0);
|
|
324
342
|
}
|
|
@@ -340,19 +358,22 @@ mod tests {
|
|
|
340
358
|
height: 12.0,
|
|
341
359
|
}];
|
|
342
360
|
|
|
361
|
+
// Low threshold - should pass
|
|
343
362
|
let word = finalize_word(&chars, 800, 90.0);
|
|
344
363
|
assert!(word.is_some());
|
|
345
364
|
|
|
365
|
+
// High threshold - should fail
|
|
346
366
|
let word = finalize_word(&chars, 800, 96.0);
|
|
347
367
|
assert!(word.is_none());
|
|
348
368
|
}
|
|
349
369
|
|
|
350
370
|
#[test]
|
|
351
371
|
fn test_coordinate_conversion() {
|
|
372
|
+
// Test PDF coordinate (bottom-left origin) to image coordinate (top-left origin)
|
|
352
373
|
let chars = vec![CharInfo {
|
|
353
374
|
text: 'A',
|
|
354
375
|
x: 100.0,
|
|
355
|
-
y: 700.0,
|
|
376
|
+
y: 700.0, // PDF coordinates: bottom-left origin
|
|
356
377
|
width: 10.0,
|
|
357
378
|
height: 12.0,
|
|
358
379
|
}];
|
|
@@ -360,11 +381,13 @@ mod tests {
|
|
|
360
381
|
let page_height = 800;
|
|
361
382
|
let word = finalize_word(&chars, page_height, 0.0).unwrap();
|
|
362
383
|
|
|
384
|
+
// top_in_image_coords = page_height - (y + height) = 800 - (700 + 12) = 88
|
|
363
385
|
assert_eq!(word.top, 88);
|
|
364
386
|
}
|
|
365
387
|
|
|
366
388
|
#[test]
|
|
367
389
|
fn test_word_bounding_box() {
|
|
390
|
+
// Test that bounding box encompasses all characters
|
|
368
391
|
let chars = vec![
|
|
369
392
|
CharInfo {
|
|
370
393
|
text: 'A',
|
|
@@ -376,18 +399,22 @@ mod tests {
|
|
|
376
399
|
CharInfo {
|
|
377
400
|
text: 'B',
|
|
378
401
|
x: 110.0,
|
|
379
|
-
y: 51.0,
|
|
402
|
+
y: 51.0, // Slightly different y
|
|
380
403
|
width: 10.0,
|
|
381
|
-
height: 13.0,
|
|
404
|
+
height: 13.0, // Slightly different height
|
|
382
405
|
},
|
|
383
406
|
];
|
|
384
407
|
|
|
385
408
|
let word = finalize_word(&chars, 800, 0.0).unwrap();
|
|
386
409
|
|
|
410
|
+
// Left should be minimum x
|
|
387
411
|
assert_eq!(word.left, 100);
|
|
388
412
|
|
|
389
|
-
|
|
413
|
+
// Width should span from leftmost to rightmost character
|
|
414
|
+
assert_eq!(word.width, 20); // 120 - 100
|
|
390
415
|
|
|
416
|
+
// Height should encompass both characters
|
|
417
|
+
// max(y+height) - min(y) = max(51+13, 50+12) - 50 = 64 - 50 = 14
|
|
391
418
|
assert_eq!(word.height, 14);
|
|
392
419
|
}
|
|
393
420
|
}
|
|
@@ -92,6 +92,8 @@ pub fn extract_text_from_pdf_with_passwords(pdf_bytes: &[u8], passwords: &[&str]
|
|
|
92
92
|
pub fn extract_text_from_pdf_document(document: &PdfDocument<'_>) -> Result<String> {
|
|
93
93
|
let page_count = document.pages().len() as usize;
|
|
94
94
|
|
|
95
|
+
// Pre-allocate capacity based on estimated page size (average 2KB per page)
|
|
96
|
+
// This reduces memory reallocations during string concatenation
|
|
95
97
|
let estimated_size = page_count * 2048;
|
|
96
98
|
let mut content = String::with_capacity(estimated_size);
|
|
97
99
|
|
|
@@ -108,6 +110,7 @@ pub fn extract_text_from_pdf_document(document: &PdfDocument<'_>) -> Result<Stri
|
|
|
108
110
|
content.push_str(&page_text);
|
|
109
111
|
}
|
|
110
112
|
|
|
113
|
+
// Shrink to actual size to free unused capacity
|
|
111
114
|
content.shrink_to_fit();
|
|
112
115
|
|
|
113
116
|
Ok(content)
|
|
@@ -361,6 +361,8 @@ pub trait DocumentExtractor: Plugin {
|
|
|
361
361
|
}
|
|
362
362
|
}
|
|
363
363
|
|
|
364
|
+
// Public registration APIs
|
|
365
|
+
|
|
364
366
|
/// Register a document extractor with the global registry.
|
|
365
367
|
///
|
|
366
368
|
/// The extractor will be registered for all MIME types it supports and will be
|
|
@@ -536,7 +538,6 @@ pub fn clear_extractors() -> crate::Result<()> {
|
|
|
536
538
|
#[cfg(test)]
|
|
537
539
|
mod tests {
|
|
538
540
|
use super::*;
|
|
539
|
-
use serial_test::serial;
|
|
540
541
|
|
|
541
542
|
struct MockExtractor {
|
|
542
543
|
mime_types: Vec<&'static str>,
|
|
@@ -829,8 +830,9 @@ mod tests {
|
|
|
829
830
|
assert_eq!(result.mime_type, "application/json");
|
|
830
831
|
}
|
|
831
832
|
|
|
833
|
+
// Tests for public registration APIs
|
|
834
|
+
|
|
832
835
|
#[test]
|
|
833
|
-
#[serial]
|
|
834
836
|
fn test_register_extractor() {
|
|
835
837
|
use std::sync::Arc;
|
|
836
838
|
|
|
@@ -845,7 +847,6 @@ mod tests {
|
|
|
845
847
|
}
|
|
846
848
|
|
|
847
849
|
#[test]
|
|
848
|
-
#[serial]
|
|
849
850
|
fn test_unregister_extractor() {
|
|
850
851
|
use std::sync::Arc;
|
|
851
852
|
|
|
@@ -860,14 +861,12 @@ mod tests {
|
|
|
860
861
|
}
|
|
861
862
|
|
|
862
863
|
#[test]
|
|
863
|
-
#[serial]
|
|
864
864
|
fn test_unregister_nonexistent_extractor() {
|
|
865
865
|
let result = super::unregister_extractor("nonexistent-extractor-xyz");
|
|
866
866
|
assert!(result.is_ok());
|
|
867
867
|
}
|
|
868
868
|
|
|
869
869
|
#[test]
|
|
870
|
-
#[serial]
|
|
871
870
|
fn test_list_extractors() {
|
|
872
871
|
use std::sync::Arc;
|
|
873
872
|
|
|
@@ -889,6 +888,7 @@ mod tests {
|
|
|
889
888
|
super::register_extractor(extractor2).unwrap();
|
|
890
889
|
|
|
891
890
|
let list = super::list_extractors().unwrap();
|
|
891
|
+
// Both extractors have the same name, so only one will be registered
|
|
892
892
|
assert_eq!(list.len(), 1);
|
|
893
893
|
assert!(list.contains(&"mock-extractor".to_string()));
|
|
894
894
|
|
|
@@ -896,7 +896,6 @@ mod tests {
|
|
|
896
896
|
}
|
|
897
897
|
|
|
898
898
|
#[test]
|
|
899
|
-
#[serial]
|
|
900
899
|
fn test_clear_extractors() {
|
|
901
900
|
use std::sync::Arc;
|
|
902
901
|
|
|
@@ -922,7 +921,6 @@ mod tests {
|
|
|
922
921
|
}
|
|
923
922
|
|
|
924
923
|
#[test]
|
|
925
|
-
#[serial]
|
|
926
924
|
fn test_register_extractor_with_invalid_name() {
|
|
927
925
|
use std::sync::Arc;
|
|
928
926
|
|
|
@@ -967,7 +965,6 @@ mod tests {
|
|
|
967
965
|
}
|
|
968
966
|
|
|
969
967
|
#[test]
|
|
970
|
-
#[serial]
|
|
971
968
|
fn test_register_extractor_with_empty_name() {
|
|
972
969
|
use std::sync::Arc;
|
|
973
970
|
|
|
@@ -81,8 +81,7 @@ pub enum OcrBackendType {
|
|
|
81
81
|
/// }
|
|
82
82
|
/// }
|
|
83
83
|
/// ```
|
|
84
|
-
#[
|
|
85
|
-
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
84
|
+
#[async_trait]
|
|
86
85
|
pub trait OcrBackend: Plugin {
|
|
87
86
|
/// Process an image and extract text via OCR.
|
|
88
87
|
///
|
|
@@ -253,6 +252,8 @@ pub trait OcrBackend: Plugin {
|
|
|
253
252
|
}
|
|
254
253
|
}
|
|
255
254
|
|
|
255
|
+
// Public registration APIs
|
|
256
|
+
|
|
256
257
|
/// Register an OCR backend with the global registry.
|
|
257
258
|
///
|
|
258
259
|
/// The OCR backend will be registered with its name from the `name()` method
|
|
@@ -319,6 +320,8 @@ pub fn register_ocr_backend(backend: Arc<dyn OcrBackend>) -> crate::Result<()> {
|
|
|
319
320
|
|
|
320
321
|
let registry = get_ocr_backend_registry();
|
|
321
322
|
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
323
|
+
// This is a critical runtime error (similar to OOM) that should bubble up
|
|
324
|
+
// as it indicates the registry is in an inconsistent state.
|
|
322
325
|
let mut registry = registry
|
|
323
326
|
.write()
|
|
324
327
|
.expect("OCR backend registry lock poisoned - critical runtime error");
|
|
@@ -354,6 +357,8 @@ pub fn unregister_ocr_backend(name: &str) -> crate::Result<()> {
|
|
|
354
357
|
|
|
355
358
|
let registry = get_ocr_backend_registry();
|
|
356
359
|
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
360
|
+
// This is a critical runtime error (similar to OOM) that should bubble up
|
|
361
|
+
// as it indicates the registry is in an inconsistent state.
|
|
357
362
|
let mut registry = registry
|
|
358
363
|
.write()
|
|
359
364
|
.expect("OCR backend registry lock poisoned - critical runtime error");
|
|
@@ -387,6 +392,8 @@ pub fn list_ocr_backends() -> crate::Result<Vec<String>> {
|
|
|
387
392
|
|
|
388
393
|
let registry = get_ocr_backend_registry();
|
|
389
394
|
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
395
|
+
// This is a critical runtime error (similar to OOM) that should bubble up
|
|
396
|
+
// as it indicates the registry is in an inconsistent state.
|
|
390
397
|
let registry = registry
|
|
391
398
|
.read()
|
|
392
399
|
.expect("OCR backend registry lock poisoned - critical runtime error");
|
|
@@ -418,6 +425,8 @@ pub fn clear_ocr_backends() -> crate::Result<()> {
|
|
|
418
425
|
|
|
419
426
|
let registry = get_ocr_backend_registry();
|
|
420
427
|
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
428
|
+
// This is a critical runtime error (similar to OOM) that should bubble up
|
|
429
|
+
// as it indicates the registry is in an inconsistent state.
|
|
421
430
|
let mut registry = registry
|
|
422
431
|
.write()
|
|
423
432
|
.expect("OCR backend registry lock poisoned - critical runtime error");
|
|
@@ -105,8 +105,7 @@ pub enum ProcessingStage {
|
|
|
105
105
|
/// }
|
|
106
106
|
/// }
|
|
107
107
|
/// ```
|
|
108
|
-
#[
|
|
109
|
-
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
108
|
+
#[async_trait]
|
|
110
109
|
pub trait PostProcessor: Plugin {
|
|
111
110
|
/// Process an extraction result.
|
|
112
111
|
///
|
|
@@ -264,19 +264,10 @@ impl DocumentExtractorRegistry {
|
|
|
264
264
|
/// # Returns
|
|
265
265
|
///
|
|
266
266
|
/// The highest priority extractor, or an error if none found.
|
|
267
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
268
|
-
skip(self),
|
|
269
|
-
fields(
|
|
270
|
-
registry.mime_type = %mime_type,
|
|
271
|
-
registry.found = tracing::field::Empty,
|
|
272
|
-
)
|
|
273
|
-
))]
|
|
274
267
|
pub fn get(&self, mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
|
|
275
268
|
if let Some(priority_map) = self.extractors.get(mime_type)
|
|
276
269
|
&& let Some((_priority, extractor)) = priority_map.iter().next_back()
|
|
277
270
|
{
|
|
278
|
-
#[cfg(feature = "otel")]
|
|
279
|
-
tracing::Span::current().record("registry.found", true);
|
|
280
271
|
return Ok(Arc::clone(extractor));
|
|
281
272
|
}
|
|
282
273
|
|
|
@@ -302,13 +293,9 @@ impl DocumentExtractorRegistry {
|
|
|
302
293
|
}
|
|
303
294
|
|
|
304
295
|
if let Some((_priority, extractor)) = best_match {
|
|
305
|
-
#[cfg(feature = "otel")]
|
|
306
|
-
tracing::Span::current().record("registry.found", true);
|
|
307
296
|
return Ok(extractor);
|
|
308
297
|
}
|
|
309
298
|
|
|
310
|
-
#[cfg(feature = "otel")]
|
|
311
|
-
tracing::Span::current().record("registry.found", false);
|
|
312
299
|
Err(KreuzbergError::UnsupportedFormat(mime_type.to_string()))
|
|
313
300
|
}
|
|
314
301
|
|
|
@@ -68,8 +68,7 @@ use std::sync::Arc;
|
|
|
68
68
|
/// }
|
|
69
69
|
/// }
|
|
70
70
|
/// ```
|
|
71
|
-
#[
|
|
72
|
-
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
71
|
+
#[async_trait]
|
|
73
72
|
pub trait Validator: Plugin {
|
|
74
73
|
/// Validate an extraction result.
|
|
75
74
|
///
|
|
@@ -276,6 +275,8 @@ pub trait Validator: Plugin {
|
|
|
276
275
|
}
|
|
277
276
|
}
|
|
278
277
|
|
|
278
|
+
// Public registration APIs
|
|
279
|
+
|
|
279
280
|
/// Register a validator with the global registry.
|
|
280
281
|
///
|
|
281
282
|
/// The validator will be registered with its default priority and will be called
|
|
@@ -811,8 +812,9 @@ mod tests {
|
|
|
811
812
|
assert!(validator.validate(&result, &config).await.is_ok());
|
|
812
813
|
}
|
|
813
814
|
|
|
815
|
+
// Tests for public registration APIs
|
|
816
|
+
|
|
814
817
|
#[test]
|
|
815
|
-
#[serial_test::serial]
|
|
816
818
|
fn test_register_validator() {
|
|
817
819
|
use std::sync::Arc;
|
|
818
820
|
|
|
@@ -824,7 +826,6 @@ mod tests {
|
|
|
824
826
|
}
|
|
825
827
|
|
|
826
828
|
#[test]
|
|
827
|
-
#[serial_test::serial]
|
|
828
829
|
fn test_unregister_validator() {
|
|
829
830
|
use std::sync::Arc;
|
|
830
831
|
|
|
@@ -836,20 +837,19 @@ mod tests {
|
|
|
836
837
|
}
|
|
837
838
|
|
|
838
839
|
#[test]
|
|
839
|
-
#[serial_test::serial]
|
|
840
840
|
fn test_unregister_nonexistent_validator() {
|
|
841
841
|
let result = super::unregister_validator("nonexistent-validator-xyz");
|
|
842
842
|
assert!(result.is_ok());
|
|
843
843
|
}
|
|
844
844
|
|
|
845
845
|
#[test]
|
|
846
|
-
#[serial_test::serial]
|
|
847
846
|
fn test_list_validators() {
|
|
848
847
|
use std::sync::Arc;
|
|
849
848
|
|
|
850
849
|
super::clear_validators().unwrap();
|
|
851
850
|
|
|
852
851
|
let validator1 = Arc::new(MockValidator { should_fail: false });
|
|
852
|
+
// Both validators have the same name, so only one will be registered
|
|
853
853
|
let validator2 = Arc::new(MockValidator { should_fail: false });
|
|
854
854
|
|
|
855
855
|
let list_before = super::list_validators().unwrap();
|
|
@@ -859,6 +859,7 @@ mod tests {
|
|
|
859
859
|
super::register_validator(validator2).unwrap();
|
|
860
860
|
|
|
861
861
|
let list = super::list_validators().unwrap();
|
|
862
|
+
// Only 1 validator registered since they have the same name
|
|
862
863
|
assert_eq!(list.len(), 1);
|
|
863
864
|
assert!(list.contains(&"mock-validator".to_string()));
|
|
864
865
|
|
|
@@ -866,7 +867,6 @@ mod tests {
|
|
|
866
867
|
}
|
|
867
868
|
|
|
868
869
|
#[test]
|
|
869
|
-
#[serial_test::serial]
|
|
870
870
|
fn test_clear_validators() {
|
|
871
871
|
use std::sync::Arc;
|
|
872
872
|
|
|
@@ -878,6 +878,7 @@ mod tests {
|
|
|
878
878
|
super::register_validator(validator1).unwrap();
|
|
879
879
|
super::register_validator(validator2).unwrap();
|
|
880
880
|
|
|
881
|
+
// Verify at least one validator is registered
|
|
881
882
|
let list_before = super::list_validators().unwrap();
|
|
882
883
|
assert!(!list_before.is_empty());
|
|
883
884
|
|
|
@@ -889,7 +890,6 @@ mod tests {
|
|
|
889
890
|
}
|
|
890
891
|
|
|
891
892
|
#[test]
|
|
892
|
-
#[serial_test::serial]
|
|
893
893
|
fn test_register_validator_with_invalid_name() {
|
|
894
894
|
use std::sync::Arc;
|
|
895
895
|
|
|
@@ -922,7 +922,6 @@ mod tests {
|
|
|
922
922
|
}
|
|
923
923
|
|
|
924
924
|
#[test]
|
|
925
|
-
#[serial_test::serial]
|
|
926
925
|
fn test_register_validator_with_empty_name() {
|
|
927
926
|
use std::sync::Arc;
|
|
928
927
|
|
|
@@ -100,7 +100,7 @@ macro_rules! embed_stopwords {
|
|
|
100
100
|
panic!(
|
|
101
101
|
"Failed to parse embedded stopwords for language '{}': {}. \
|
|
102
102
|
This indicates corrupted or malformed JSON in the embedded stopwords data. \
|
|
103
|
-
Please report this issue at https://github.com/
|
|
103
|
+
Please report this issue at https://github.com/Goldziher/kreuzberg/issues",
|
|
104
104
|
$lang, e
|
|
105
105
|
);
|
|
106
106
|
}
|
|
@@ -1437,7 +1437,7 @@ mod tests {
|
|
|
1437
1437
|
let duration = start.elapsed();
|
|
1438
1438
|
|
|
1439
1439
|
assert!(
|
|
1440
|
-
duration.as_millis() <
|
|
1440
|
+
duration.as_millis() < 100,
|
|
1441
1441
|
"30,000 lookups took too long: {:?}",
|
|
1442
1442
|
duration
|
|
1443
1443
|
);
|
|
@@ -844,6 +844,18 @@ pub struct CacheStats {
|
|
|
844
844
|
pub newest_file_age_days: f64,
|
|
845
845
|
}
|
|
846
846
|
|
|
847
|
+
/// Pandoc extraction result.
|
|
848
|
+
///
|
|
849
|
+
/// Result of extracting content from a document using Pandoc,
|
|
850
|
+
/// including text and any metadata Pandoc was able to extract.
|
|
851
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
852
|
+
pub struct PandocExtractionResult {
|
|
853
|
+
/// Extracted text content
|
|
854
|
+
pub content: String,
|
|
855
|
+
/// Metadata extracted by Pandoc (varies by format)
|
|
856
|
+
pub metadata: HashMap<String, serde_json::Value>,
|
|
857
|
+
}
|
|
858
|
+
|
|
847
859
|
/// LibreOffice conversion result.
|
|
848
860
|
///
|
|
849
861
|
/// Result of converting a legacy office document (e.g., .doc, .ppt)
|
|
@@ -859,45 +871,3 @@ pub struct LibreOfficeConversionResult {
|
|
|
859
871
|
/// Target MIME type after conversion
|
|
860
872
|
pub target_mime: String,
|
|
861
873
|
}
|
|
862
|
-
|
|
863
|
-
#[cfg(test)]
|
|
864
|
-
mod tests {
|
|
865
|
-
use super::*;
|
|
866
|
-
|
|
867
|
-
#[test]
|
|
868
|
-
fn test_metadata_serialization_with_format() {
|
|
869
|
-
let mut metadata = Metadata {
|
|
870
|
-
format: Some(FormatMetadata::Text(TextMetadata {
|
|
871
|
-
line_count: 1,
|
|
872
|
-
word_count: 2,
|
|
873
|
-
character_count: 13,
|
|
874
|
-
headers: None,
|
|
875
|
-
links: None,
|
|
876
|
-
code_blocks: None,
|
|
877
|
-
})),
|
|
878
|
-
..Default::default()
|
|
879
|
-
};
|
|
880
|
-
|
|
881
|
-
metadata
|
|
882
|
-
.additional
|
|
883
|
-
.insert("quality_score".to_string(), serde_json::json!(1.0));
|
|
884
|
-
|
|
885
|
-
let json = serde_json::to_value(&metadata).unwrap();
|
|
886
|
-
println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
|
|
887
|
-
|
|
888
|
-
// Check that format_type is present
|
|
889
|
-
assert!(
|
|
890
|
-
json.get("format_type").is_some(),
|
|
891
|
-
"format_type should be present in serialized JSON"
|
|
892
|
-
);
|
|
893
|
-
assert_eq!(json.get("format_type").unwrap(), "text");
|
|
894
|
-
|
|
895
|
-
// Check that Text metadata fields are present
|
|
896
|
-
assert_eq!(json.get("line_count").unwrap(), 1);
|
|
897
|
-
assert_eq!(json.get("word_count").unwrap(), 2);
|
|
898
|
-
assert_eq!(json.get("character_count").unwrap(), 13);
|
|
899
|
-
|
|
900
|
-
// Check that additional field is merged
|
|
901
|
-
assert_eq!(json.get("quality_score").unwrap(), 1.0);
|
|
902
|
-
}
|
|
903
|
-
}
|
|
@@ -19,18 +19,6 @@ use kreuzberg::core::extractor::extract_file_sync;
|
|
|
19
19
|
|
|
20
20
|
mod helpers;
|
|
21
21
|
|
|
22
|
-
fn trim_trailing_newlines(value: &str) -> &str {
|
|
23
|
-
value.trim_end_matches(['\n', '\r'])
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
fn assert_text_content(actual: &str, expected: &str) {
|
|
27
|
-
assert_eq!(
|
|
28
|
-
trim_trailing_newlines(actual),
|
|
29
|
-
expected,
|
|
30
|
-
"Content mismatch after trimming trailing newlines"
|
|
31
|
-
);
|
|
32
|
-
}
|
|
33
|
-
|
|
34
22
|
/// Test that batch extraction processes documents in parallel.
|
|
35
23
|
///
|
|
36
24
|
/// Validates:
|
|
@@ -317,8 +305,7 @@ async fn test_batch_bytes_parallel_processing() {
|
|
|
317
305
|
assert_eq!(results.len(), 30);
|
|
318
306
|
|
|
319
307
|
for (i, result) in results.iter().enumerate() {
|
|
320
|
-
|
|
321
|
-
assert_text_content(&result.content, &expected);
|
|
308
|
+
assert_eq!(result.content, format!("Test content number {}", i));
|
|
322
309
|
}
|
|
323
310
|
|
|
324
311
|
println!("Batch processed 30 byte arrays in {:?}", duration);
|
|
@@ -343,9 +330,9 @@ async fn test_batch_bytes_mixed_valid_invalid() {
|
|
|
343
330
|
let results = results.unwrap();
|
|
344
331
|
assert_eq!(results.len(), 5);
|
|
345
332
|
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
333
|
+
assert_eq!(results[0].content, "valid content 1");
|
|
334
|
+
assert_eq!(results[2].content, "valid content 2");
|
|
335
|
+
assert_eq!(results[4].content, "valid content 3");
|
|
349
336
|
|
|
350
337
|
assert!(results[1].metadata.error.is_some());
|
|
351
338
|
assert!(results[3].metadata.error.is_some());
|
|
@@ -547,8 +534,7 @@ async fn test_batch_accuracy_under_load() {
|
|
|
547
534
|
for (i, result) in results.iter().enumerate() {
|
|
548
535
|
let expected = format!("Document number {} with unique content", i);
|
|
549
536
|
assert_eq!(
|
|
550
|
-
|
|
551
|
-
expected,
|
|
537
|
+
result.content, expected,
|
|
552
538
|
"Document {} content mismatch - possible cross-contamination",
|
|
553
539
|
i
|
|
554
540
|
);
|