kreuzberg 4.2.12 → 4.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +2 -2
  7. data/vendor/kreuzberg/Cargo.toml +24 -7
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
  10. data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
  11. data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
  12. data/vendor/kreuzberg/src/core/mime.rs +47 -2
  13. data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
  14. data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
  15. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
  16. data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
  17. data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
  18. data/vendor/kreuzberg/src/extraction/image.rs +405 -18
  19. data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
  20. data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
  21. data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
  22. data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
  23. data/vendor/kreuzberg/src/extractors/image.rs +25 -0
  24. data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
  25. data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
  26. data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
  27. data/vendor/kreuzberg/src/extractors/security.rs +2 -1
  28. data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
  29. data/vendor/kreuzberg/src/extractors/text.rs +33 -4
  30. data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
  31. data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
  32. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  33. metadata +4 -2
@@ -88,6 +88,9 @@ pub mod html;
88
88
  #[cfg(feature = "office")]
89
89
  pub mod bibtex;
90
90
 
91
+ #[cfg(feature = "office")]
92
+ pub mod citation;
93
+
91
94
  #[cfg(all(feature = "tokio-runtime", feature = "office"))]
92
95
  pub mod docx;
93
96
 
@@ -146,7 +149,7 @@ pub use text::{MarkdownExtractor, PlainTextExtractor};
146
149
  pub use image::ImageExtractor;
147
150
 
148
151
  #[cfg(feature = "archives")]
149
- pub use archive::{SevenZExtractor, TarExtractor, ZipExtractor};
152
+ pub use archive::{GzipExtractor, SevenZExtractor, TarExtractor, ZipExtractor};
150
153
 
151
154
  #[cfg(feature = "email")]
152
155
  pub use email::EmailExtractor;
@@ -160,6 +163,9 @@ pub use html::HtmlExtractor;
160
163
  #[cfg(feature = "office")]
161
164
  pub use bibtex::BibtexExtractor;
162
165
 
166
+ #[cfg(feature = "office")]
167
+ pub use citation::CitationExtractor;
168
+
163
169
  #[cfg(all(feature = "tokio-runtime", feature = "office"))]
164
170
  pub use docx::DocxExtractor;
165
171
 
@@ -278,7 +284,11 @@ pub fn register_default_extractors() -> Result<()> {
278
284
  registry.register(Arc::new(ImageExtractor::new()))?;
279
285
 
280
286
  #[cfg(feature = "xml")]
281
- registry.register(Arc::new(XmlExtractor::new()))?;
287
+ {
288
+ registry.register(Arc::new(XmlExtractor::new()))?;
289
+ registry.register(Arc::new(JatsExtractor::new()))?;
290
+ registry.register(Arc::new(DocbookExtractor::new()))?;
291
+ }
282
292
 
283
293
  #[cfg(feature = "pdf")]
284
294
  registry.register(Arc::new(PdfExtractor::new()))?;
@@ -292,6 +302,7 @@ pub fn register_default_extractors() -> Result<()> {
292
302
  {
293
303
  registry.register(Arc::new(EnhancedMarkdownExtractor::new()))?;
294
304
  registry.register(Arc::new(BibtexExtractor::new()))?;
305
+ registry.register(Arc::new(CitationExtractor::new()))?;
295
306
  registry.register(Arc::new(EpubExtractor::new()))?;
296
307
  registry.register(Arc::new(FictionBookExtractor::new()))?;
297
308
  registry.register(Arc::new(RtfExtractor::new()))?;
@@ -321,6 +332,7 @@ pub fn register_default_extractors() -> Result<()> {
321
332
  registry.register(Arc::new(ZipExtractor::new()))?;
322
333
  registry.register(Arc::new(TarExtractor::new()))?;
323
334
  registry.register(Arc::new(SevenZExtractor::new()))?;
335
+ registry.register(Arc::new(GzipExtractor::new()))?;
324
336
  }
325
337
 
326
338
  Ok(())
@@ -362,8 +374,10 @@ mod tests {
362
374
 
363
375
  #[cfg(feature = "xml")]
364
376
  {
365
- expected_count += 1;
377
+ expected_count += 3;
366
378
  assert!(extractor_names.contains(&"xml-extractor".to_string()));
379
+ assert!(extractor_names.contains(&"jats-extractor".to_string()));
380
+ assert!(extractor_names.contains(&"docbook-extractor".to_string()));
367
381
  }
368
382
 
369
383
  #[cfg(feature = "pdf")]
@@ -380,9 +394,10 @@ mod tests {
380
394
 
381
395
  #[cfg(feature = "office")]
382
396
  {
383
- expected_count += 10;
397
+ expected_count += 11;
384
398
  assert!(extractor_names.contains(&"markdown-extractor".to_string()));
385
399
  assert!(extractor_names.contains(&"bibtex-extractor".to_string()));
400
+ assert!(extractor_names.contains(&"citation-extractor".to_string()));
386
401
  assert!(extractor_names.contains(&"epub-extractor".to_string()));
387
402
  assert!(extractor_names.contains(&"fictionbook-extractor".to_string()));
388
403
  assert!(extractor_names.contains(&"rtf-extractor".to_string()));
@@ -416,10 +431,11 @@ mod tests {
416
431
 
417
432
  #[cfg(feature = "archives")]
418
433
  {
419
- expected_count += 3;
434
+ expected_count += 4;
420
435
  assert!(extractor_names.contains(&"zip-extractor".to_string()));
421
436
  assert!(extractor_names.contains(&"tar-extractor".to_string()));
422
437
  assert!(extractor_names.contains(&"7z-extractor".to_string()));
438
+ assert!(extractor_names.contains(&"gzip-extractor".to_string()));
423
439
  }
424
440
 
425
441
  assert_eq!(
@@ -95,7 +95,7 @@ impl DocumentExtractor for OpmlExtractor {
95
95
  }
96
96
 
97
97
  fn supported_mime_types(&self) -> &[&str] {
98
- &["text/x-opml", "application/xml+opml"]
98
+ &["text/x-opml", "application/xml+opml", "application/x-opml+xml"]
99
99
  }
100
100
 
101
101
  fn priority(&self) -> i32 {
@@ -135,6 +135,7 @@ mod tests {
135
135
  let supported = extractor.supported_mime_types();
136
136
  assert!(supported.contains(&"text/x-opml"));
137
137
  assert!(supported.contains(&"application/xml+opml"));
138
+ assert!(supported.contains(&"application/x-opml+xml"));
138
139
  }
139
140
 
140
141
  #[tokio::test]
@@ -14,7 +14,8 @@ use std::io::{Read, Seek};
14
14
  ///
15
15
  /// All limits are intentionally conservative to prevent DoS attacks
16
16
  /// while still supporting legitimate documents.
17
- #[derive(Clone, Debug)]
17
+ #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
18
+ #[serde(default)]
18
19
  pub struct SecurityLimits {
19
20
  /// Maximum uncompressed size for archives (500 MB)
20
21
  pub max_archive_size: usize,
@@ -59,8 +59,12 @@ impl DocumentExtractor for StructuredExtractor {
59
59
  _config: &ExtractionConfig,
60
60
  ) -> Result<ExtractionResult> {
61
61
  let structured_result = match mime_type {
62
- "application/json" | "text/json" => crate::extraction::structured::parse_json(content, None)?,
63
- "application/x-yaml" | "text/yaml" | "text/x-yaml" => crate::extraction::structured::parse_yaml(content)?,
62
+ "application/json" | "text/json" | "application/csl+json" => {
63
+ crate::extraction::structured::parse_json(content, None)?
64
+ }
65
+ "application/yaml" | "application/x-yaml" | "text/yaml" | "text/x-yaml" => {
66
+ crate::extraction::structured::parse_yaml(content)?
67
+ }
64
68
  "application/toml" | "text/toml" => crate::extraction::structured::parse_toml(content)?,
65
69
  _ => return Err(crate::KreuzbergError::UnsupportedFormat(mime_type.to_string())),
66
70
  };
@@ -112,6 +116,8 @@ impl DocumentExtractor for StructuredExtractor {
112
116
  &[
113
117
  "application/json",
114
118
  "text/json",
119
+ "application/csl+json",
120
+ "application/yaml",
115
121
  "application/x-yaml",
116
122
  "text/yaml",
117
123
  "text/x-yaml",
@@ -141,9 +147,10 @@ mod tests {
141
147
  fn test_structured_extractor_supported_mime_types() {
142
148
  let extractor = StructuredExtractor::new();
143
149
  let mime_types = extractor.supported_mime_types();
144
- assert_eq!(mime_types.len(), 7);
150
+ assert_eq!(mime_types.len(), 9);
145
151
  assert!(mime_types.contains(&"application/json"));
146
152
  assert!(mime_types.contains(&"application/x-yaml"));
147
153
  assert!(mime_types.contains(&"application/toml"));
154
+ assert!(mime_types.contains(&"application/csl+json"));
148
155
  }
149
156
  }
@@ -97,7 +97,15 @@ impl DocumentExtractor for PlainTextExtractor {
97
97
  }
98
98
 
99
99
  fn supported_mime_types(&self) -> &[&str] {
100
- &["text/plain", "text/csv", "text/tab-separated-values"]
100
+ &[
101
+ "text/plain",
102
+ "text/csv",
103
+ "text/tab-separated-values",
104
+ "text/troff",
105
+ "text/x-mdoc",
106
+ "text/x-pod",
107
+ "text/x-dokuwiki",
108
+ ]
101
109
  }
102
110
 
103
111
  fn priority(&self) -> i32 {
@@ -192,7 +200,12 @@ impl DocumentExtractor for MarkdownExtractor {
192
200
  }
193
201
 
194
202
  fn supported_mime_types(&self) -> &[&str] {
195
- &["text/markdown", "text/x-markdown"]
203
+ &[
204
+ "text/markdown",
205
+ "text/x-markdown",
206
+ "text/x-markdown-extra",
207
+ "text/x-multimarkdown",
208
+ ]
196
209
  }
197
210
 
198
211
  fn priority(&self) -> i32 {
@@ -253,7 +266,15 @@ mod tests {
253
266
  assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
254
267
  assert_eq!(
255
268
  extractor.supported_mime_types(),
256
- &["text/plain", "text/csv", "text/tab-separated-values"]
269
+ &[
270
+ "text/plain",
271
+ "text/csv",
272
+ "text/tab-separated-values",
273
+ "text/troff",
274
+ "text/x-mdoc",
275
+ "text/x-pod",
276
+ "text/x-dokuwiki",
277
+ ]
257
278
  );
258
279
  assert_eq!(extractor.priority(), 50);
259
280
  }
@@ -263,7 +284,15 @@ mod tests {
263
284
  let extractor = MarkdownExtractor::new();
264
285
  assert_eq!(extractor.name(), "markdown-extractor");
265
286
  assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
266
- assert_eq!(extractor.supported_mime_types(), &["text/markdown", "text/x-markdown"]);
287
+ assert_eq!(
288
+ extractor.supported_mime_types(),
289
+ &[
290
+ "text/markdown",
291
+ "text/x-markdown",
292
+ "text/x-markdown-extra",
293
+ "text/x-multimarkdown"
294
+ ]
295
+ );
267
296
  assert_eq!(extractor.priority(), 50);
268
297
  }
269
298
  }
@@ -96,7 +96,12 @@ impl DocumentExtractor for XmlExtractor {
96
96
  }
97
97
 
98
98
  fn supported_mime_types(&self) -> &[&str] {
99
- &["application/xml", "text/xml", "image/svg+xml"]
99
+ &[
100
+ "application/xml",
101
+ "text/xml",
102
+ "image/svg+xml",
103
+ "application/x-endnote+xml",
104
+ ]
100
105
  }
101
106
 
102
107
  fn priority(&self) -> i32 {
@@ -142,7 +147,12 @@ mod tests {
142
147
  assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
143
148
  assert_eq!(
144
149
  extractor.supported_mime_types(),
145
- &["application/xml", "text/xml", "image/svg+xml"]
150
+ &[
151
+ "application/xml",
152
+ "text/xml",
153
+ "image/svg+xml",
154
+ "application/x-endnote+xml"
155
+ ]
146
156
  );
147
157
  assert_eq!(extractor.priority(), 50);
148
158
  }
@@ -72,8 +72,21 @@ pub(super) fn perform_ocr(
72
72
  )
73
73
  });
74
74
 
75
- let img = image::load_from_memory(image_bytes)
76
- .map_err(|e| OcrError::ImageProcessingFailed(format!("Failed to decode image: {}", e)))?;
75
+ let img = {
76
+ // Check for JPEG 2000 format which the image crate doesn't support
77
+ if crate::extraction::image::is_jp2(image_bytes) || crate::extraction::image::is_j2k(image_bytes) {
78
+ crate::extraction::image::decode_jp2_to_rgb(image_bytes)
79
+ .map(image::DynamicImage::ImageRgb8)
80
+ .map_err(|e| OcrError::ImageProcessingFailed(format!("Failed to decode JP2 image: {}", e)))?
81
+ } else if crate::extraction::image::is_jbig2(image_bytes) {
82
+ crate::extraction::image::decode_jbig2_to_gray(image_bytes)
83
+ .map(image::DynamicImage::ImageLuma8)
84
+ .map_err(|e| OcrError::ImageProcessingFailed(format!("Failed to decode JBIG2 image: {}", e)))?
85
+ } else {
86
+ image::load_from_memory(image_bytes)
87
+ .map_err(|e| OcrError::ImageProcessingFailed(format!("Failed to decode image: {}", e)))?
88
+ }
89
+ };
77
90
 
78
91
  let rgb_image = img.to_rgb8();
79
92
  let (width, height) = rgb_image.dimensions();
@@ -224,7 +237,7 @@ pub(super) fn perform_ocr(
224
237
  "tsv" => {
225
238
  let tsv = tsv_data_for_tables
226
239
  .as_ref()
227
- .expect("TSV data should be extracted when output_format is 'tsv'")
240
+ .ok_or_else(|| OcrError::ProcessingFailed("TSV data not available".to_string()))?
228
241
  .clone();
229
242
  (tsv, "text/plain".to_string())
230
243
  }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.2.12"
3
+ version = "4.2.13"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.12
4
+ version: 4.2.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-06 00:00:00.000000000 Z
11
+ date: 2026-02-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -422,6 +422,7 @@ files:
422
422
  - vendor/kreuzberg/src/core/server_config/validation.rs
423
423
  - vendor/kreuzberg/src/embeddings.rs
424
424
  - vendor/kreuzberg/src/error.rs
425
+ - vendor/kreuzberg/src/extraction/archive/gzip.rs
425
426
  - vendor/kreuzberg/src/extraction/archive/mod.rs
426
427
  - vendor/kreuzberg/src/extraction/archive/sevenz.rs
427
428
  - vendor/kreuzberg/src/extraction/archive/tar.rs
@@ -463,6 +464,7 @@ files:
463
464
  - vendor/kreuzberg/src/extraction/xml.rs
464
465
  - vendor/kreuzberg/src/extractors/archive.rs
465
466
  - vendor/kreuzberg/src/extractors/bibtex.rs
467
+ - vendor/kreuzberg/src/extractors/citation.rs
466
468
  - vendor/kreuzberg/src/extractors/djot_format/attributes.rs
467
469
  - vendor/kreuzberg/src/extractors/djot_format/conversion.rs
468
470
  - vendor/kreuzberg/src/extractors/djot_format/extractor.rs