kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -5
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +32 -11
- data/vendor/kreuzberg/README.md +54 -8
- data/vendor/kreuzberg/build.rs +549 -132
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
- data/vendor/kreuzberg/src/mcp/server.rs +120 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/lib.rs +1 -0
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +13 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
|
@@ -283,17 +283,29 @@ pub fn parse_html_metadata(markdown: &str) -> Result<(Option<HtmlMetadata>, Stri
|
|
|
283
283
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse YAML frontmatter: {}", e)))?;
|
|
284
284
|
|
|
285
285
|
let mut metadata = HtmlMetadata::default();
|
|
286
|
+
let mut title: Option<String> = None;
|
|
287
|
+
let mut description: Option<String> = None;
|
|
288
|
+
let mut keywords: Option<Vec<String>> = None;
|
|
289
|
+
let mut author: Option<String> = None;
|
|
286
290
|
|
|
287
291
|
if let serde_json::Value::Object(mapping) = yaml_value {
|
|
288
292
|
for (key, value) in mapping {
|
|
289
293
|
if let serde_json::Value::String(value_str) = value {
|
|
290
294
|
match key.as_str() {
|
|
291
|
-
"title" =>
|
|
295
|
+
"title" => title = Some(value_str),
|
|
292
296
|
"base-href" => metadata.base_href = Some(value_str),
|
|
293
297
|
"canonical" => metadata.canonical = Some(value_str),
|
|
294
|
-
"meta-description" =>
|
|
295
|
-
"meta-keywords" =>
|
|
296
|
-
|
|
298
|
+
"meta-description" => description = Some(value_str),
|
|
299
|
+
"meta-keywords" => {
|
|
300
|
+
keywords = Some(
|
|
301
|
+
value_str
|
|
302
|
+
.split(',')
|
|
303
|
+
.map(|k| k.trim().to_string())
|
|
304
|
+
.filter(|k| !k.is_empty())
|
|
305
|
+
.collect(),
|
|
306
|
+
)
|
|
307
|
+
}
|
|
308
|
+
"meta-author" => author = Some(value_str),
|
|
297
309
|
"meta-og-title" | "meta-og:title" => metadata.og_title = Some(value_str),
|
|
298
310
|
"meta-og-description" | "meta-og:description" => metadata.og_description = Some(value_str),
|
|
299
311
|
"meta-og-image" | "meta-og:image" => metadata.og_image = Some(value_str),
|
|
@@ -319,10 +331,10 @@ pub fn parse_html_metadata(markdown: &str) -> Result<(Option<HtmlMetadata>, Stri
|
|
|
319
331
|
}
|
|
320
332
|
}
|
|
321
333
|
|
|
322
|
-
let has_metadata =
|
|
323
|
-
||
|
|
324
|
-
||
|
|
325
|
-
||
|
|
334
|
+
let has_metadata = title.is_some()
|
|
335
|
+
|| description.is_some()
|
|
336
|
+
|| keywords.is_some()
|
|
337
|
+
|| author.is_some()
|
|
326
338
|
|| metadata.canonical.is_some()
|
|
327
339
|
|| metadata.base_href.is_some()
|
|
328
340
|
|| metadata.og_title.is_some()
|
|
@@ -331,6 +343,10 @@ pub fn parse_html_metadata(markdown: &str) -> Result<(Option<HtmlMetadata>, Stri
|
|
|
331
343
|
|| metadata.twitter_card.is_some();
|
|
332
344
|
|
|
333
345
|
if has_metadata {
|
|
346
|
+
metadata.title = title;
|
|
347
|
+
metadata.description = description;
|
|
348
|
+
metadata.keywords = keywords.map(|kws| kws.join(", "));
|
|
349
|
+
metadata.author = author;
|
|
334
350
|
Ok((Some(metadata), remaining_content.to_string()))
|
|
335
351
|
} else {
|
|
336
352
|
Ok((None, remaining_content.to_string()))
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
//! Image extraction functionality.
|
|
2
2
|
//!
|
|
3
|
-
//! This module provides functions for extracting metadata and EXIF data from images
|
|
3
|
+
//! This module provides functions for extracting metadata and EXIF data from images,
|
|
4
|
+
//! including support for multi-frame TIFF files.
|
|
4
5
|
|
|
5
6
|
use crate::error::{KreuzbergError, Result};
|
|
6
7
|
use exif::{In, Reader, Tag};
|
|
@@ -94,6 +95,128 @@ fn extract_exif_data(bytes: &[u8]) -> HashMap<String, String> {
|
|
|
94
95
|
exif_map
|
|
95
96
|
}
|
|
96
97
|
|
|
98
|
+
/// Result of OCR extraction from an image with optional page tracking.
|
|
99
|
+
#[derive(Debug, Clone)]
|
|
100
|
+
pub struct ImageOcrResult {
|
|
101
|
+
/// Extracted text content
|
|
102
|
+
pub content: String,
|
|
103
|
+
/// Character byte boundaries per frame (for multi-frame TIFFs)
|
|
104
|
+
pub boundaries: Option<Vec<crate::types::PageBoundary>>,
|
|
105
|
+
/// Per-frame content information
|
|
106
|
+
pub page_contents: Option<Vec<crate::types::PageContent>>,
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/// Detects the number of frames in a TIFF file.
|
|
110
|
+
///
|
|
111
|
+
/// Returns the count of image frames/pages in a TIFF. Single-frame TIFFs return 1.
|
|
112
|
+
/// Invalid or non-TIFF data returns an error.
|
|
113
|
+
///
|
|
114
|
+
/// # Arguments
|
|
115
|
+
/// * `bytes` - Raw TIFF file bytes
|
|
116
|
+
///
|
|
117
|
+
/// # Returns
|
|
118
|
+
/// Frame count if valid TIFF, error otherwise.
|
|
119
|
+
#[cfg(feature = "ocr")]
|
|
120
|
+
fn detect_tiff_frame_count(bytes: &[u8]) -> Result<usize> {
|
|
121
|
+
use tiff::decoder::Decoder;
|
|
122
|
+
let mut decoder =
|
|
123
|
+
Decoder::new(Cursor::new(bytes)).map_err(|e| KreuzbergError::parsing(format!("TIFF decode: {}", e)))?;
|
|
124
|
+
|
|
125
|
+
let mut count = 1;
|
|
126
|
+
while decoder.next_image().is_ok() {
|
|
127
|
+
count += 1;
|
|
128
|
+
}
|
|
129
|
+
Ok(count)
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
/// Extract text from image bytes using OCR with optional page tracking for multi-frame TIFFs.
|
|
133
|
+
///
|
|
134
|
+
/// This function:
|
|
135
|
+
/// - Detects if the image is a multi-frame TIFF
|
|
136
|
+
/// - For multi-frame TIFFs with PageConfig enabled, iterates frames and tracks boundaries
|
|
137
|
+
/// - For single-frame images or when page tracking is disabled, runs OCR on the whole image
|
|
138
|
+
/// - Returns (content, boundaries, page_contents) tuple
|
|
139
|
+
///
|
|
140
|
+
/// # Arguments
|
|
141
|
+
/// * `bytes` - Image file bytes
|
|
142
|
+
/// * `mime_type` - MIME type (e.g., "image/tiff")
|
|
143
|
+
/// * `ocr_result` - OCR backend result containing the text
|
|
144
|
+
/// * `page_config` - Optional page configuration for boundary tracking
|
|
145
|
+
///
|
|
146
|
+
/// # Returns
|
|
147
|
+
/// ImageOcrResult with content and optional boundaries for pagination
|
|
148
|
+
#[cfg(feature = "ocr")]
|
|
149
|
+
pub fn extract_text_from_image_with_ocr(
|
|
150
|
+
bytes: &[u8],
|
|
151
|
+
mime_type: &str,
|
|
152
|
+
ocr_result: String,
|
|
153
|
+
page_config: Option<&crate::core::config::PageConfig>,
|
|
154
|
+
) -> Result<ImageOcrResult> {
|
|
155
|
+
let is_tiff = mime_type.to_lowercase().contains("tiff");
|
|
156
|
+
let should_track_pages = page_config.is_some() && is_tiff;
|
|
157
|
+
|
|
158
|
+
if !should_track_pages {
|
|
159
|
+
return Ok(ImageOcrResult {
|
|
160
|
+
content: ocr_result,
|
|
161
|
+
boundaries: None,
|
|
162
|
+
page_contents: None,
|
|
163
|
+
});
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
let frame_count = detect_tiff_frame_count(bytes)?;
|
|
167
|
+
|
|
168
|
+
if frame_count <= 1 {
|
|
169
|
+
return Ok(ImageOcrResult {
|
|
170
|
+
content: ocr_result,
|
|
171
|
+
boundaries: None,
|
|
172
|
+
page_contents: None,
|
|
173
|
+
});
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
let content_len = ocr_result.len();
|
|
177
|
+
let content_per_frame = if frame_count > 0 {
|
|
178
|
+
content_len / frame_count
|
|
179
|
+
} else {
|
|
180
|
+
content_len
|
|
181
|
+
};
|
|
182
|
+
|
|
183
|
+
let mut boundaries = Vec::new();
|
|
184
|
+
let mut page_contents = Vec::new();
|
|
185
|
+
let mut byte_offset = 0;
|
|
186
|
+
|
|
187
|
+
for frame_num in 1..=frame_count {
|
|
188
|
+
let frame_end = if frame_num == frame_count {
|
|
189
|
+
content_len
|
|
190
|
+
} else {
|
|
191
|
+
let raw_end = (frame_num * content_per_frame).min(content_len);
|
|
192
|
+
(raw_end..=content_len)
|
|
193
|
+
.find(|&i| ocr_result.is_char_boundary(i))
|
|
194
|
+
.unwrap_or(content_len)
|
|
195
|
+
};
|
|
196
|
+
|
|
197
|
+
boundaries.push(crate::types::PageBoundary {
|
|
198
|
+
byte_start: byte_offset,
|
|
199
|
+
byte_end: frame_end,
|
|
200
|
+
page_number: frame_num,
|
|
201
|
+
});
|
|
202
|
+
|
|
203
|
+
page_contents.push(crate::types::PageContent {
|
|
204
|
+
page_number: frame_num,
|
|
205
|
+
content: ocr_result[byte_offset..frame_end].to_string(),
|
|
206
|
+
tables: vec![],
|
|
207
|
+
images: vec![],
|
|
208
|
+
});
|
|
209
|
+
|
|
210
|
+
byte_offset = frame_end;
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
Ok(ImageOcrResult {
|
|
214
|
+
content: ocr_result,
|
|
215
|
+
boundaries: Some(boundaries),
|
|
216
|
+
page_contents: Some(page_contents),
|
|
217
|
+
})
|
|
218
|
+
}
|
|
219
|
+
|
|
97
220
|
#[cfg(test)]
|
|
98
221
|
mod tests {
|
|
99
222
|
use super::*;
|
|
@@ -462,8 +462,7 @@ mod tests {
|
|
|
462
462
|
async fn test_check_libreoffice_missing_dependency_error() {
|
|
463
463
|
let result = check_libreoffice_available().await;
|
|
464
464
|
|
|
465
|
-
if
|
|
466
|
-
let err = result.unwrap_err();
|
|
465
|
+
if let Err(err) = result {
|
|
467
466
|
match err {
|
|
468
467
|
KreuzbergError::MissingDependency(msg) => {
|
|
469
468
|
assert!(msg.contains("LibreOffice") || msg.contains("soffice"));
|
|
@@ -104,7 +104,6 @@ pub fn extract_odt_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<
|
|
|
104
104
|
|
|
105
105
|
let root = doc.root_element();
|
|
106
106
|
|
|
107
|
-
// Extract Dublin Core elements
|
|
108
107
|
let title = super::parse_xml_text(root, "title");
|
|
109
108
|
let subject = super::parse_xml_text(root, "subject");
|
|
110
109
|
let creator = super::parse_xml_text(root, "creator");
|
|
@@ -112,7 +111,6 @@ pub fn extract_odt_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<
|
|
|
112
111
|
let language = super::parse_xml_text(root, "language");
|
|
113
112
|
let date = super::parse_xml_text(root, "date");
|
|
114
113
|
|
|
115
|
-
// Extract OpenDocument meta elements
|
|
116
114
|
let initial_creator = super::parse_xml_text(root, "initial-creator");
|
|
117
115
|
let keywords = super::parse_xml_text(root, "keyword");
|
|
118
116
|
let creation_date = super::parse_xml_text(root, "creation-date");
|
|
@@ -120,7 +118,6 @@ pub fn extract_odt_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<
|
|
|
120
118
|
let editing_duration = super::parse_xml_text(root, "editing-duration");
|
|
121
119
|
let editing_cycles = super::parse_xml_text(root, "editing-cycles");
|
|
122
120
|
|
|
123
|
-
// Extract document statistics
|
|
124
121
|
let page_count = super::parse_xml_int(root, "page-count");
|
|
125
122
|
let word_count = super::parse_xml_int(root, "word-count");
|
|
126
123
|
let character_count = super::parse_xml_int(root, "character-count");
|