kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -3
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +25 -11
  14. data/vendor/kreuzberg/README.md +13 -8
  15. data/vendor/kreuzberg/build.rs +17 -6
  16. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  17. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  18. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  19. data/vendor/kreuzberg/src/core/config.rs +49 -1
  20. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  21. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  22. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  23. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  24. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  25. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  26. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  27. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  28. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  29. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  31. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  32. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  33. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  34. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  35. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  36. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  37. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  38. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  39. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  40. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  43. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  44. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  45. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  47. data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
  48. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  49. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  50. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  51. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  52. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  53. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  54. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  55. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  56. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  57. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  58. data/vendor/kreuzberg/src/lib.rs +10 -2
  59. data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
  60. data/vendor/kreuzberg/src/mcp/server.rs +14 -12
  61. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  94. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  95. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  97. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  98. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  99. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  100. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  101. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  102. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  103. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  104. data/vendor/rb-sys/Cargo.lock +15 -15
  105. data/vendor/rb-sys/Cargo.toml +4 -4
  106. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  107. data/vendor/rb-sys/bin/release.sh +9 -8
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/macros.rs +2 -2
  113. data/vendor/rb-sys/src/special_consts.rs +1 -1
  114. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  116. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  120. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  121. data/vendor/rb-sys/src/stable_api.rs +0 -1
  122. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  123. metadata +11 -10
  124. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  125. data/vendor/rb-sys/.cargo-ok +0 -1
  126. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -283,17 +283,29 @@ pub fn parse_html_metadata(markdown: &str) -> Result<(Option<HtmlMetadata>, Stri
283
283
  .map_err(|e| KreuzbergError::parsing(format!("Failed to parse YAML frontmatter: {}", e)))?;
284
284
 
285
285
  let mut metadata = HtmlMetadata::default();
286
+ let mut title: Option<String> = None;
287
+ let mut description: Option<String> = None;
288
+ let mut keywords: Option<Vec<String>> = None;
289
+ let mut author: Option<String> = None;
286
290
 
287
291
  if let serde_json::Value::Object(mapping) = yaml_value {
288
292
  for (key, value) in mapping {
289
293
  if let serde_json::Value::String(value_str) = value {
290
294
  match key.as_str() {
291
- "title" => metadata.title = Some(value_str),
295
+ "title" => title = Some(value_str),
292
296
  "base-href" => metadata.base_href = Some(value_str),
293
297
  "canonical" => metadata.canonical = Some(value_str),
294
- "meta-description" => metadata.description = Some(value_str),
295
- "meta-keywords" => metadata.keywords = Some(value_str),
296
- "meta-author" => metadata.author = Some(value_str),
298
+ "meta-description" => description = Some(value_str),
299
+ "meta-keywords" => {
300
+ keywords = Some(
301
+ value_str
302
+ .split(',')
303
+ .map(|k| k.trim().to_string())
304
+ .filter(|k| !k.is_empty())
305
+ .collect(),
306
+ )
307
+ }
308
+ "meta-author" => author = Some(value_str),
297
309
  "meta-og-title" | "meta-og:title" => metadata.og_title = Some(value_str),
298
310
  "meta-og-description" | "meta-og:description" => metadata.og_description = Some(value_str),
299
311
  "meta-og-image" | "meta-og:image" => metadata.og_image = Some(value_str),
@@ -319,10 +331,10 @@ pub fn parse_html_metadata(markdown: &str) -> Result<(Option<HtmlMetadata>, Stri
319
331
  }
320
332
  }
321
333
 
322
- let has_metadata = metadata.title.is_some()
323
- || metadata.description.is_some()
324
- || metadata.keywords.is_some()
325
- || metadata.author.is_some()
334
+ let has_metadata = title.is_some()
335
+ || description.is_some()
336
+ || keywords.is_some()
337
+ || author.is_some()
326
338
  || metadata.canonical.is_some()
327
339
  || metadata.base_href.is_some()
328
340
  || metadata.og_title.is_some()
@@ -331,6 +343,10 @@ pub fn parse_html_metadata(markdown: &str) -> Result<(Option<HtmlMetadata>, Stri
331
343
  || metadata.twitter_card.is_some();
332
344
 
333
345
  if has_metadata {
346
+ metadata.title = title;
347
+ metadata.description = description;
348
+ metadata.keywords = keywords.map(|kws| kws.join(", "));
349
+ metadata.author = author;
334
350
  Ok((Some(metadata), remaining_content.to_string()))
335
351
  } else {
336
352
  Ok((None, remaining_content.to_string()))
@@ -1,6 +1,7 @@
1
1
  //! Image extraction functionality.
2
2
  //!
3
- //! This module provides functions for extracting metadata and EXIF data from images.
3
+ //! This module provides functions for extracting metadata and EXIF data from images,
4
+ //! including support for multi-frame TIFF files.
4
5
 
5
6
  use crate::error::{KreuzbergError, Result};
6
7
  use exif::{In, Reader, Tag};
@@ -94,6 +95,128 @@ fn extract_exif_data(bytes: &[u8]) -> HashMap<String, String> {
94
95
  exif_map
95
96
  }
96
97
 
98
+ /// Result of OCR extraction from an image with optional page tracking.
99
+ #[derive(Debug, Clone)]
100
+ pub struct ImageOcrResult {
101
+ /// Extracted text content
102
+ pub content: String,
103
+ /// Byte-offset boundaries of each frame's text within `content` (for multi-frame TIFFs)
104
+ pub boundaries: Option<Vec<crate::types::PageBoundary>>,
105
+ /// Per-frame content information
106
+ pub page_contents: Option<Vec<crate::types::PageContent>>,
107
+ }
108
+
109
+ /// Detects the number of frames in a TIFF file.
110
+ ///
111
+ /// Returns the count of image frames/pages in a TIFF. Single-frame TIFFs return 1.
112
+ /// Invalid or non-TIFF data returns an error.
113
+ ///
114
+ /// # Arguments
115
+ /// * `bytes` - Raw TIFF file bytes
116
+ ///
117
+ /// # Returns
118
+ /// Frame count if valid TIFF, error otherwise.
119
+ #[cfg(feature = "ocr")]
120
+ fn detect_tiff_frame_count(bytes: &[u8]) -> Result<usize> {
121
+ use tiff::decoder::Decoder;
122
+ let mut decoder =
123
+ Decoder::new(Cursor::new(bytes)).map_err(|e| KreuzbergError::parsing(format!("TIFF decode: {}", e)))?;
124
+
125
+ let mut count = 1;
126
+ while decoder.next_image().is_ok() {
127
+ count += 1;
128
+ }
129
+ Ok(count)
130
+ }
131
+
132
+ /// Extract text from image bytes using OCR with optional page tracking for multi-frame TIFFs.
133
+ ///
134
+ /// This function:
135
+ /// - Detects if the image is a multi-frame TIFF
136
+ /// - For multi-frame TIFFs with PageConfig enabled, splits the OCR text into roughly equal byte ranges (one per frame, snapped to char boundaries) and records them as boundaries
137
+ /// - For single-frame images or when page tracking is disabled, runs OCR on the whole image
138
+ /// - Returns an `ImageOcrResult` holding the content plus optional boundaries and page_contents
139
+ ///
140
+ /// # Arguments
141
+ /// * `bytes` - Image file bytes
142
+ /// * `mime_type` - MIME type (e.g., "image/tiff")
143
+ /// * `ocr_result` - OCR backend result containing the text
144
+ /// * `page_config` - Optional page configuration for boundary tracking
145
+ ///
146
+ /// # Returns
147
+ /// ImageOcrResult with content and optional boundaries for pagination
148
+ #[cfg(feature = "ocr")]
149
+ pub fn extract_text_from_image_with_ocr(
150
+ bytes: &[u8],
151
+ mime_type: &str,
152
+ ocr_result: String,
153
+ page_config: Option<&crate::core::config::PageConfig>,
154
+ ) -> Result<ImageOcrResult> {
155
+ let is_tiff = mime_type.to_lowercase().contains("tiff");
156
+ let should_track_pages = page_config.is_some() && is_tiff;
157
+
158
+ if !should_track_pages {
159
+ return Ok(ImageOcrResult {
160
+ content: ocr_result,
161
+ boundaries: None,
162
+ page_contents: None,
163
+ });
164
+ }
165
+
166
+ let frame_count = detect_tiff_frame_count(bytes)?;
167
+
168
+ if frame_count <= 1 {
169
+ return Ok(ImageOcrResult {
170
+ content: ocr_result,
171
+ boundaries: None,
172
+ page_contents: None,
173
+ });
174
+ }
175
+
176
+ let content_len = ocr_result.len();
177
+ let content_per_frame = if frame_count > 0 {
178
+ content_len / frame_count
179
+ } else {
180
+ content_len
181
+ };
182
+
183
+ let mut boundaries = Vec::new();
184
+ let mut page_contents = Vec::new();
185
+ let mut byte_offset = 0;
186
+
187
+ for frame_num in 1..=frame_count {
188
+ let frame_end = if frame_num == frame_count {
189
+ content_len
190
+ } else {
191
+ let raw_end = (frame_num * content_per_frame).min(content_len);
192
+ (raw_end..=content_len)
193
+ .find(|&i| ocr_result.is_char_boundary(i))
194
+ .unwrap_or(content_len)
195
+ };
196
+
197
+ boundaries.push(crate::types::PageBoundary {
198
+ byte_start: byte_offset,
199
+ byte_end: frame_end,
200
+ page_number: frame_num,
201
+ });
202
+
203
+ page_contents.push(crate::types::PageContent {
204
+ page_number: frame_num,
205
+ content: ocr_result[byte_offset..frame_end].to_string(),
206
+ tables: vec![],
207
+ images: vec![],
208
+ });
209
+
210
+ byte_offset = frame_end;
211
+ }
212
+
213
+ Ok(ImageOcrResult {
214
+ content: ocr_result,
215
+ boundaries: Some(boundaries),
216
+ page_contents: Some(page_contents),
217
+ })
218
+ }
219
+
97
220
  #[cfg(test)]
98
221
  mod tests {
99
222
  use super::*;
@@ -462,8 +462,7 @@ mod tests {
462
462
  async fn test_check_libreoffice_missing_dependency_error() {
463
463
  let result = check_libreoffice_available().await;
464
464
 
465
- if result.is_err() {
466
- let err = result.unwrap_err();
465
+ if let Err(err) = result {
467
466
  match err {
468
467
  KreuzbergError::MissingDependency(msg) => {
469
468
  assert!(msg.contains("LibreOffice") || msg.contains("soffice"));
@@ -104,7 +104,6 @@ pub fn extract_odt_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<
104
104
 
105
105
  let root = doc.root_element();
106
106
 
107
- // Extract Dublin Core elements
108
107
  let title = super::parse_xml_text(root, "title");
109
108
  let subject = super::parse_xml_text(root, "subject");
110
109
  let creator = super::parse_xml_text(root, "creator");
@@ -112,7 +111,6 @@ pub fn extract_odt_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<
112
111
  let language = super::parse_xml_text(root, "language");
113
112
  let date = super::parse_xml_text(root, "date");
114
113
 
115
- // Extract OpenDocument meta elements
116
114
  let initial_creator = super::parse_xml_text(root, "initial-creator");
117
115
  let keywords = super::parse_xml_text(root, "keyword");
118
116
  let creation_date = super::parse_xml_text(root, "creation-date");
@@ -120,7 +118,6 @@ pub fn extract_odt_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<
120
118
  let editing_duration = super::parse_xml_text(root, "editing-duration");
121
119
  let editing_cycles = super::parse_xml_text(root, "editing-cycles");
122
120
 
123
- // Extract document statistics
124
121
  let page_count = super::parse_xml_int(root, "page-count");
125
122
  let word_count = super::parse_xml_int(root, "word-count");
126
123
  let character_count = super::parse_xml_int(root, "character-count");