kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
|
@@ -12,6 +12,7 @@ use kreuzberg::plugins::registry::{DocumentExtractorRegistry, ValidatorRegistry}
|
|
|
12
12
|
use kreuzberg::plugins::{DocumentExtractor, Plugin, Validator};
|
|
13
13
|
use kreuzberg::types::{ExtractionResult, Metadata};
|
|
14
14
|
use kreuzberg::{KreuzbergError, Result};
|
|
15
|
+
use std::borrow::Cow;
|
|
15
16
|
use std::path::Path;
|
|
16
17
|
use std::sync::Arc;
|
|
17
18
|
|
|
@@ -119,7 +120,7 @@ impl DocumentExtractor for MockExtractor {
|
|
|
119
120
|
) -> Result<ExtractionResult> {
|
|
120
121
|
Ok(ExtractionResult {
|
|
121
122
|
content: format!("Extracted by {}: {}", self.name, String::from_utf8_lossy(content)),
|
|
122
|
-
mime_type: mime_type.to_string(),
|
|
123
|
+
mime_type: Cow::Owned(mime_type.to_string()),
|
|
123
124
|
metadata: Metadata::default(),
|
|
124
125
|
tables: vec![],
|
|
125
126
|
detected_languages: None,
|
|
@@ -28,9 +28,10 @@ serde_json = { workspace = true }
|
|
|
28
28
|
serde = { workspace = true }
|
|
29
29
|
async-trait = { workspace = true }
|
|
30
30
|
tokio = { workspace = true }
|
|
31
|
-
html-to-markdown-rs = { version = "2.24.
|
|
31
|
+
html-to-markdown-rs = { version = "2.24.4", default-features = false }
|
|
32
32
|
rayon = { version = "1.11", optional = true }
|
|
33
33
|
log = "0.4"
|
|
34
|
+
ahash = "0.8"
|
|
34
35
|
|
|
35
36
|
[target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
|
|
36
37
|
kreuzberg = { path = "../kreuzberg", features = [
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
|
|
2
2
|
use kreuzberg::types::{Chunk, ChunkMetadata, ExtractionResult, Metadata, PageStructure, PageUnitType};
|
|
3
3
|
use kreuzberg_ffi::{CExtractionResultView, kreuzberg_get_result_view};
|
|
4
|
+
use std::borrow::Cow;
|
|
4
5
|
use std::ffi::CString;
|
|
5
6
|
use std::hint;
|
|
6
7
|
use std::mem;
|
|
@@ -63,7 +64,7 @@ fn create_test_result(content_size: usize, chunk_count: usize) -> ExtractionResu
|
|
|
63
64
|
|
|
64
65
|
ExtractionResult {
|
|
65
66
|
content,
|
|
66
|
-
mime_type: "application/pdf"
|
|
67
|
+
mime_type: Cow::Borrowed("application/pdf"),
|
|
67
68
|
metadata,
|
|
68
69
|
tables: vec![],
|
|
69
70
|
detected_languages: Some(vec!["en".to_string(), "de".to_string()]),
|
|
@@ -109,7 +110,7 @@ fn bench_copy_based_approach(c: &mut Criterion) {
|
|
|
109
110
|
|
|
110
111
|
b.iter(|| {
|
|
111
112
|
let content_cstr = CString::new(result.content.as_str()).unwrap();
|
|
112
|
-
let mime_cstr = CString::new(result.mime_type
|
|
113
|
+
let mime_cstr = CString::new(&*result.mime_type).unwrap();
|
|
113
114
|
let language_cstr = result
|
|
114
115
|
.metadata
|
|
115
116
|
.language
|
|
@@ -1208,6 +1208,38 @@ const char *kreuzberg_error_code_description(uint32_t code);
|
|
|
1208
1208
|
*/
|
|
1209
1209
|
struct CErrorDetails kreuzberg_get_error_details(void);
|
|
1210
1210
|
|
|
1211
|
+
/**
|
|
1212
|
+
* Heap-allocated variant of `kreuzberg_get_error_details` that returns a pointer.
|
|
1213
|
+
*
|
|
1214
|
+
* This is the preferred variant for language bindings (Java, Go, C#) where
|
|
1215
|
+
* returning structs by value across FFI boundaries causes ABI issues,
|
|
1216
|
+
* particularly on ARM64.
|
|
1217
|
+
*
|
|
1218
|
+
* The returned pointer must be freed with `kreuzberg_free_error_details()`.
|
|
1219
|
+
* Returns NULL if allocation fails.
|
|
1220
|
+
*
|
|
1221
|
+
* # C Signature
|
|
1222
|
+
*
|
|
1223
|
+
* ```c
|
|
1224
|
+
* CErrorDetails* kreuzberg_get_error_details_ptr(void);
|
|
1225
|
+
* ```
|
|
1226
|
+
*/
|
|
1227
|
+
struct CErrorDetails *kreuzberg_get_error_details_ptr(void);
|
|
1228
|
+
|
|
1229
|
+
/**
|
|
1230
|
+
* Frees a `CErrorDetails` pointer returned by `kreuzberg_get_error_details_ptr()`.
|
|
1231
|
+
*
|
|
1232
|
+
* This function frees all internal string fields and the struct itself.
|
|
1233
|
+
* Passing NULL is a no-op.
|
|
1234
|
+
*
|
|
1235
|
+
* # C Signature
|
|
1236
|
+
*
|
|
1237
|
+
* ```c
|
|
1238
|
+
* void kreuzberg_free_error_details(CErrorDetails* details);
|
|
1239
|
+
* ```
|
|
1240
|
+
*/
|
|
1241
|
+
void kreuzberg_free_error_details(struct CErrorDetails *details);
|
|
1242
|
+
|
|
1211
1243
|
/**
|
|
1212
1244
|
* Classifies an error based on the error message string.
|
|
1213
1245
|
*
|
|
@@ -540,6 +540,62 @@ pub extern "C" fn kreuzberg_get_error_details() -> CErrorDetails {
|
|
|
540
540
|
}
|
|
541
541
|
}
|
|
542
542
|
|
|
543
|
+
/// Heap-allocated variant of `kreuzberg_get_error_details` that returns a pointer.
|
|
544
|
+
///
|
|
545
|
+
/// This is the preferred variant for language bindings (Java, Go, C#) where
|
|
546
|
+
/// returning structs by value across FFI boundaries causes ABI issues,
|
|
547
|
+
/// particularly on ARM64.
|
|
548
|
+
///
|
|
549
|
+
/// The returned pointer must be freed with `kreuzberg_free_error_details()`.
|
|
550
|
+
/// Returns NULL if allocation fails.
|
|
551
|
+
///
|
|
552
|
+
/// # C Signature
|
|
553
|
+
///
|
|
554
|
+
/// ```c
|
|
555
|
+
/// CErrorDetails* kreuzberg_get_error_details_ptr(void);
|
|
556
|
+
/// ```
|
|
557
|
+
#[unsafe(no_mangle)]
|
|
558
|
+
pub extern "C" fn kreuzberg_get_error_details_ptr() -> *mut CErrorDetails {
|
|
559
|
+
let details = kreuzberg_get_error_details();
|
|
560
|
+
Box::into_raw(Box::new(details))
|
|
561
|
+
}
|
|
562
|
+
|
|
563
|
+
/// Frees a `CErrorDetails` pointer returned by `kreuzberg_get_error_details_ptr()`.
|
|
564
|
+
///
|
|
565
|
+
/// This function frees all internal string fields and the struct itself.
|
|
566
|
+
/// Passing NULL is a no-op.
|
|
567
|
+
///
|
|
568
|
+
/// # C Signature
|
|
569
|
+
///
|
|
570
|
+
/// ```c
|
|
571
|
+
/// void kreuzberg_free_error_details(CErrorDetails* details);
|
|
572
|
+
/// ```
|
|
573
|
+
#[unsafe(no_mangle)]
|
|
574
|
+
pub extern "C" fn kreuzberg_free_error_details(details: *mut CErrorDetails) {
|
|
575
|
+
if details.is_null() {
|
|
576
|
+
return;
|
|
577
|
+
}
|
|
578
|
+
unsafe {
|
|
579
|
+
let details = Box::from_raw(details);
|
|
580
|
+
// Free all non-null string fields
|
|
581
|
+
if !details.message.is_null() {
|
|
582
|
+
let _ = CString::from_raw(details.message);
|
|
583
|
+
}
|
|
584
|
+
if !details.error_type.is_null() {
|
|
585
|
+
let _ = CString::from_raw(details.error_type);
|
|
586
|
+
}
|
|
587
|
+
if !details.source_file.is_null() {
|
|
588
|
+
let _ = CString::from_raw(details.source_file);
|
|
589
|
+
}
|
|
590
|
+
if !details.source_function.is_null() {
|
|
591
|
+
let _ = CString::from_raw(details.source_function);
|
|
592
|
+
}
|
|
593
|
+
if !details.context_info.is_null() {
|
|
594
|
+
let _ = CString::from_raw(details.context_info);
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
}
|
|
598
|
+
|
|
543
599
|
/// Classifies an error based on the error message string.
|
|
544
600
|
///
|
|
545
601
|
/// Analyzes an error message and attempts to classify it into one of the standard
|
|
@@ -81,7 +81,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
|
|
|
81
81
|
);
|
|
82
82
|
|
|
83
83
|
let mime_type_guard = CStringGuard::new(
|
|
84
|
-
CString::new(mime_type).map_err(|e| format!("Failed to convert MIME type to C string: {}", e))?,
|
|
84
|
+
CString::new(mime_type.to_string()).map_err(|e| format!("Failed to convert MIME type to C string: {}", e))?,
|
|
85
85
|
);
|
|
86
86
|
|
|
87
87
|
let language_guard = match &metadata.language {
|
|
@@ -213,6 +213,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
|
|
|
213
213
|
mod tests {
|
|
214
214
|
use super::*;
|
|
215
215
|
use kreuzberg::types::{Chunk, ChunkMetadata, ExtractionResult, Metadata, Table};
|
|
216
|
+
use std::borrow::Cow;
|
|
216
217
|
use std::ffi::CStr;
|
|
217
218
|
|
|
218
219
|
#[test]
|
|
@@ -352,7 +353,7 @@ mod tests {
|
|
|
352
353
|
fn test_to_c_extraction_result_basic() {
|
|
353
354
|
let result = ExtractionResult {
|
|
354
355
|
content: "Test content".to_string(),
|
|
355
|
-
mime_type: "text/plain"
|
|
356
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
356
357
|
metadata: Metadata::default(),
|
|
357
358
|
tables: vec![],
|
|
358
359
|
detected_languages: None,
|
|
@@ -391,7 +392,7 @@ mod tests {
|
|
|
391
392
|
fn test_to_c_extraction_result_with_null_bytes() {
|
|
392
393
|
let result = ExtractionResult {
|
|
393
394
|
content: "Test\0content with null".to_string(),
|
|
394
|
-
mime_type: "text/plain"
|
|
395
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
395
396
|
metadata: Metadata::default(),
|
|
396
397
|
tables: vec![],
|
|
397
398
|
detected_languages: None,
|
|
@@ -440,7 +441,7 @@ mod tests {
|
|
|
440
441
|
|
|
441
442
|
let result = ExtractionResult {
|
|
442
443
|
content: "Test content".to_string(),
|
|
443
|
-
mime_type: "text/plain"
|
|
444
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
444
445
|
metadata,
|
|
445
446
|
tables: vec![],
|
|
446
447
|
detected_languages: Some(vec!["en".to_string(), "de".to_string()]),
|
|
@@ -519,7 +520,7 @@ mod tests {
|
|
|
519
520
|
|
|
520
521
|
let result = ExtractionResult {
|
|
521
522
|
content: "Test content".to_string(),
|
|
522
|
-
mime_type: "text/plain"
|
|
523
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
523
524
|
metadata: Metadata::default(),
|
|
524
525
|
tables: vec![table],
|
|
525
526
|
detected_languages: None,
|
|
@@ -160,7 +160,7 @@ impl OcrBackend for FfiOcrBackend {
|
|
|
160
160
|
|
|
161
161
|
Ok(ExtractionResult {
|
|
162
162
|
content: result_text,
|
|
163
|
-
mime_type: "text/plain"
|
|
163
|
+
mime_type: std::borrow::Cow::Borrowed("text/plain"),
|
|
164
164
|
metadata: kreuzberg::types::Metadata::default(),
|
|
165
165
|
tables: vec![],
|
|
166
166
|
detected_languages: None,
|
|
@@ -368,6 +368,7 @@ pub unsafe extern "C" fn kreuzberg_result_get_metadata_field(
|
|
|
368
368
|
#[cfg(test)]
|
|
369
369
|
mod tests {
|
|
370
370
|
use super::*;
|
|
371
|
+
use std::borrow::Cow;
|
|
371
372
|
use std::ffi::CStr;
|
|
372
373
|
|
|
373
374
|
fn create_test_result() -> ExtractionResult {
|
|
@@ -389,7 +390,7 @@ mod tests {
|
|
|
389
390
|
|
|
390
391
|
ExtractionResult {
|
|
391
392
|
content: "Sample content for testing".to_string(),
|
|
392
|
-
mime_type: "text/plain"
|
|
393
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
393
394
|
metadata,
|
|
394
395
|
tables: vec![],
|
|
395
396
|
detected_languages: Some(vec!["en".to_string(), "de".to_string()]),
|
|
@@ -398,6 +398,7 @@ pub unsafe extern "C" fn kreuzberg_view_get_mime_type(
|
|
|
398
398
|
mod tests {
|
|
399
399
|
use super::*;
|
|
400
400
|
use kreuzberg::types::{Metadata, PageStructure, PageUnitType};
|
|
401
|
+
use std::borrow::Cow;
|
|
401
402
|
use std::mem;
|
|
402
403
|
|
|
403
404
|
fn create_test_result() -> ExtractionResult {
|
|
@@ -419,7 +420,7 @@ mod tests {
|
|
|
419
420
|
|
|
420
421
|
ExtractionResult {
|
|
421
422
|
content: "Sample content for zero-copy testing".to_string(),
|
|
422
|
-
mime_type: "text/plain"
|
|
423
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
423
424
|
metadata,
|
|
424
425
|
tables: vec![],
|
|
425
426
|
detected_languages: Some(vec!["en".to_string(), "de".to_string()]),
|
|
@@ -712,7 +713,7 @@ mod tests {
|
|
|
712
713
|
fn test_view_all_counts_zero() {
|
|
713
714
|
let result = ExtractionResult {
|
|
714
715
|
content: "Minimal content".to_string(),
|
|
715
|
-
mime_type: "text/plain"
|
|
716
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
716
717
|
metadata: Metadata::default(),
|
|
717
718
|
tables: vec![],
|
|
718
719
|
detected_languages: None,
|
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
//! ```
|
|
40
40
|
|
|
41
41
|
use crate::{clear_last_error, set_last_error};
|
|
42
|
-
use
|
|
42
|
+
use ahash::AHashMap;
|
|
43
43
|
use std::ffi::{CStr, CString};
|
|
44
44
|
use std::os::raw::c_char;
|
|
45
45
|
use std::ptr;
|
|
@@ -82,7 +82,7 @@ struct InternedString {
|
|
|
82
82
|
/// Global string interning table.
|
|
83
83
|
struct StringInternTable {
|
|
84
84
|
/// Map from string content to interned entry
|
|
85
|
-
strings:
|
|
85
|
+
strings: AHashMap<String, InternedString>,
|
|
86
86
|
|
|
87
87
|
/// Total number of intern requests
|
|
88
88
|
total_requests: usize,
|
|
@@ -95,7 +95,7 @@ impl StringInternTable {
|
|
|
95
95
|
/// Create new intern table with pre-populated common strings.
|
|
96
96
|
fn new() -> Self {
|
|
97
97
|
let mut table = Self {
|
|
98
|
-
strings:
|
|
98
|
+
strings: AHashMap::new(),
|
|
99
99
|
total_requests: 0,
|
|
100
100
|
cache_hits: 0,
|
|
101
101
|
};
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.2.6
|
|
4
|
+
version: 4.2.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01
|
|
11
|
+
date: 2026-02-01 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|