kreuzberg 4.2.12 → 4.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +24 -7
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
- data/vendor/kreuzberg/src/core/mime.rs +47 -2
- data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
- data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
- data/vendor/kreuzberg/src/extraction/image.rs +405 -18
- data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
- data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
- data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
- data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
- data/vendor/kreuzberg/src/extractors/image.rs +25 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
- data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
- data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
- data/vendor/kreuzberg/src/extractors/security.rs +2 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
- data/vendor/kreuzberg/src/extractors/text.rs +33 -4
- data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +4 -2
|
@@ -88,6 +88,9 @@ pub mod html;
|
|
|
88
88
|
#[cfg(feature = "office")]
|
|
89
89
|
pub mod bibtex;
|
|
90
90
|
|
|
91
|
+
#[cfg(feature = "office")]
|
|
92
|
+
pub mod citation;
|
|
93
|
+
|
|
91
94
|
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
92
95
|
pub mod docx;
|
|
93
96
|
|
|
@@ -146,7 +149,7 @@ pub use text::{MarkdownExtractor, PlainTextExtractor};
|
|
|
146
149
|
pub use image::ImageExtractor;
|
|
147
150
|
|
|
148
151
|
#[cfg(feature = "archives")]
|
|
149
|
-
pub use archive::{SevenZExtractor, TarExtractor, ZipExtractor};
|
|
152
|
+
pub use archive::{GzipExtractor, SevenZExtractor, TarExtractor, ZipExtractor};
|
|
150
153
|
|
|
151
154
|
#[cfg(feature = "email")]
|
|
152
155
|
pub use email::EmailExtractor;
|
|
@@ -160,6 +163,9 @@ pub use html::HtmlExtractor;
|
|
|
160
163
|
#[cfg(feature = "office")]
|
|
161
164
|
pub use bibtex::BibtexExtractor;
|
|
162
165
|
|
|
166
|
+
#[cfg(feature = "office")]
|
|
167
|
+
pub use citation::CitationExtractor;
|
|
168
|
+
|
|
163
169
|
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
164
170
|
pub use docx::DocxExtractor;
|
|
165
171
|
|
|
@@ -278,7 +284,11 @@ pub fn register_default_extractors() -> Result<()> {
|
|
|
278
284
|
registry.register(Arc::new(ImageExtractor::new()))?;
|
|
279
285
|
|
|
280
286
|
#[cfg(feature = "xml")]
|
|
281
|
-
|
|
287
|
+
{
|
|
288
|
+
registry.register(Arc::new(XmlExtractor::new()))?;
|
|
289
|
+
registry.register(Arc::new(JatsExtractor::new()))?;
|
|
290
|
+
registry.register(Arc::new(DocbookExtractor::new()))?;
|
|
291
|
+
}
|
|
282
292
|
|
|
283
293
|
#[cfg(feature = "pdf")]
|
|
284
294
|
registry.register(Arc::new(PdfExtractor::new()))?;
|
|
@@ -292,6 +302,7 @@ pub fn register_default_extractors() -> Result<()> {
|
|
|
292
302
|
{
|
|
293
303
|
registry.register(Arc::new(EnhancedMarkdownExtractor::new()))?;
|
|
294
304
|
registry.register(Arc::new(BibtexExtractor::new()))?;
|
|
305
|
+
registry.register(Arc::new(CitationExtractor::new()))?;
|
|
295
306
|
registry.register(Arc::new(EpubExtractor::new()))?;
|
|
296
307
|
registry.register(Arc::new(FictionBookExtractor::new()))?;
|
|
297
308
|
registry.register(Arc::new(RtfExtractor::new()))?;
|
|
@@ -321,6 +332,7 @@ pub fn register_default_extractors() -> Result<()> {
|
|
|
321
332
|
registry.register(Arc::new(ZipExtractor::new()))?;
|
|
322
333
|
registry.register(Arc::new(TarExtractor::new()))?;
|
|
323
334
|
registry.register(Arc::new(SevenZExtractor::new()))?;
|
|
335
|
+
registry.register(Arc::new(GzipExtractor::new()))?;
|
|
324
336
|
}
|
|
325
337
|
|
|
326
338
|
Ok(())
|
|
@@ -362,8 +374,10 @@ mod tests {
|
|
|
362
374
|
|
|
363
375
|
#[cfg(feature = "xml")]
|
|
364
376
|
{
|
|
365
|
-
expected_count += 1;
|
|
377
|
+
expected_count += 3;
|
|
366
378
|
assert!(extractor_names.contains(&"xml-extractor".to_string()));
|
|
379
|
+
assert!(extractor_names.contains(&"jats-extractor".to_string()));
|
|
380
|
+
assert!(extractor_names.contains(&"docbook-extractor".to_string()));
|
|
367
381
|
}
|
|
368
382
|
|
|
369
383
|
#[cfg(feature = "pdf")]
|
|
@@ -380,9 +394,10 @@ mod tests {
|
|
|
380
394
|
|
|
381
395
|
#[cfg(feature = "office")]
|
|
382
396
|
{
|
|
383
|
-
expected_count += 10;
|
|
397
|
+
expected_count += 11;
|
|
384
398
|
assert!(extractor_names.contains(&"markdown-extractor".to_string()));
|
|
385
399
|
assert!(extractor_names.contains(&"bibtex-extractor".to_string()));
|
|
400
|
+
assert!(extractor_names.contains(&"citation-extractor".to_string()));
|
|
386
401
|
assert!(extractor_names.contains(&"epub-extractor".to_string()));
|
|
387
402
|
assert!(extractor_names.contains(&"fictionbook-extractor".to_string()));
|
|
388
403
|
assert!(extractor_names.contains(&"rtf-extractor".to_string()));
|
|
@@ -416,10 +431,11 @@ mod tests {
|
|
|
416
431
|
|
|
417
432
|
#[cfg(feature = "archives")]
|
|
418
433
|
{
|
|
419
|
-
expected_count += 3;
|
|
434
|
+
expected_count += 4;
|
|
420
435
|
assert!(extractor_names.contains(&"zip-extractor".to_string()));
|
|
421
436
|
assert!(extractor_names.contains(&"tar-extractor".to_string()));
|
|
422
437
|
assert!(extractor_names.contains(&"7z-extractor".to_string()));
|
|
438
|
+
assert!(extractor_names.contains(&"gzip-extractor".to_string()));
|
|
423
439
|
}
|
|
424
440
|
|
|
425
441
|
assert_eq!(
|
|
@@ -95,7 +95,7 @@ impl DocumentExtractor for OpmlExtractor {
|
|
|
95
95
|
}
|
|
96
96
|
|
|
97
97
|
fn supported_mime_types(&self) -> &[&str] {
|
|
98
|
-
&["text/x-opml", "application/xml+opml"]
|
|
98
|
+
&["text/x-opml", "application/xml+opml", "application/x-opml+xml"]
|
|
99
99
|
}
|
|
100
100
|
|
|
101
101
|
fn priority(&self) -> i32 {
|
|
@@ -135,6 +135,7 @@ mod tests {
|
|
|
135
135
|
let supported = extractor.supported_mime_types();
|
|
136
136
|
assert!(supported.contains(&"text/x-opml"));
|
|
137
137
|
assert!(supported.contains(&"application/xml+opml"));
|
|
138
|
+
assert!(supported.contains(&"application/x-opml+xml"));
|
|
138
139
|
}
|
|
139
140
|
|
|
140
141
|
#[tokio::test]
|
|
@@ -14,7 +14,8 @@ use std::io::{Read, Seek};
|
|
|
14
14
|
///
|
|
15
15
|
/// All limits are intentionally conservative to prevent DoS attacks
|
|
16
16
|
/// while still supporting legitimate documents.
|
|
17
|
-
#[derive(Clone, Debug)]
|
|
17
|
+
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
|
|
18
|
+
#[serde(default)]
|
|
18
19
|
pub struct SecurityLimits {
|
|
19
20
|
/// Maximum uncompressed size for archives (500 MB)
|
|
20
21
|
pub max_archive_size: usize,
|
|
@@ -59,8 +59,12 @@ impl DocumentExtractor for StructuredExtractor {
|
|
|
59
59
|
_config: &ExtractionConfig,
|
|
60
60
|
) -> Result<ExtractionResult> {
|
|
61
61
|
let structured_result = match mime_type {
|
|
62
|
-
"application/json" | "text/json" =>
|
|
63
|
-
|
|
62
|
+
"application/json" | "text/json" | "application/csl+json" => {
|
|
63
|
+
crate::extraction::structured::parse_json(content, None)?
|
|
64
|
+
}
|
|
65
|
+
"application/yaml" | "application/x-yaml" | "text/yaml" | "text/x-yaml" => {
|
|
66
|
+
crate::extraction::structured::parse_yaml(content)?
|
|
67
|
+
}
|
|
64
68
|
"application/toml" | "text/toml" => crate::extraction::structured::parse_toml(content)?,
|
|
65
69
|
_ => return Err(crate::KreuzbergError::UnsupportedFormat(mime_type.to_string())),
|
|
66
70
|
};
|
|
@@ -112,6 +116,8 @@ impl DocumentExtractor for StructuredExtractor {
|
|
|
112
116
|
&[
|
|
113
117
|
"application/json",
|
|
114
118
|
"text/json",
|
|
119
|
+
"application/csl+json",
|
|
120
|
+
"application/yaml",
|
|
115
121
|
"application/x-yaml",
|
|
116
122
|
"text/yaml",
|
|
117
123
|
"text/x-yaml",
|
|
@@ -141,9 +147,10 @@ mod tests {
|
|
|
141
147
|
fn test_structured_extractor_supported_mime_types() {
|
|
142
148
|
let extractor = StructuredExtractor::new();
|
|
143
149
|
let mime_types = extractor.supported_mime_types();
|
|
144
|
-
assert_eq!(mime_types.len(), 7);
|
|
150
|
+
assert_eq!(mime_types.len(), 9);
|
|
145
151
|
assert!(mime_types.contains(&"application/json"));
|
|
146
152
|
assert!(mime_types.contains(&"application/x-yaml"));
|
|
147
153
|
assert!(mime_types.contains(&"application/toml"));
|
|
154
|
+
assert!(mime_types.contains(&"application/csl+json"));
|
|
148
155
|
}
|
|
149
156
|
}
|
|
@@ -97,7 +97,15 @@ impl DocumentExtractor for PlainTextExtractor {
|
|
|
97
97
|
}
|
|
98
98
|
|
|
99
99
|
fn supported_mime_types(&self) -> &[&str] {
|
|
100
|
-
&[
|
|
100
|
+
&[
|
|
101
|
+
"text/plain",
|
|
102
|
+
"text/csv",
|
|
103
|
+
"text/tab-separated-values",
|
|
104
|
+
"text/troff",
|
|
105
|
+
"text/x-mdoc",
|
|
106
|
+
"text/x-pod",
|
|
107
|
+
"text/x-dokuwiki",
|
|
108
|
+
]
|
|
101
109
|
}
|
|
102
110
|
|
|
103
111
|
fn priority(&self) -> i32 {
|
|
@@ -192,7 +200,12 @@ impl DocumentExtractor for MarkdownExtractor {
|
|
|
192
200
|
}
|
|
193
201
|
|
|
194
202
|
fn supported_mime_types(&self) -> &[&str] {
|
|
195
|
-
&[
|
|
203
|
+
&[
|
|
204
|
+
"text/markdown",
|
|
205
|
+
"text/x-markdown",
|
|
206
|
+
"text/x-markdown-extra",
|
|
207
|
+
"text/x-multimarkdown",
|
|
208
|
+
]
|
|
196
209
|
}
|
|
197
210
|
|
|
198
211
|
fn priority(&self) -> i32 {
|
|
@@ -253,7 +266,15 @@ mod tests {
|
|
|
253
266
|
assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
|
|
254
267
|
assert_eq!(
|
|
255
268
|
extractor.supported_mime_types(),
|
|
256
|
-
&[
|
|
269
|
+
&[
|
|
270
|
+
"text/plain",
|
|
271
|
+
"text/csv",
|
|
272
|
+
"text/tab-separated-values",
|
|
273
|
+
"text/troff",
|
|
274
|
+
"text/x-mdoc",
|
|
275
|
+
"text/x-pod",
|
|
276
|
+
"text/x-dokuwiki",
|
|
277
|
+
]
|
|
257
278
|
);
|
|
258
279
|
assert_eq!(extractor.priority(), 50);
|
|
259
280
|
}
|
|
@@ -263,7 +284,15 @@ mod tests {
|
|
|
263
284
|
let extractor = MarkdownExtractor::new();
|
|
264
285
|
assert_eq!(extractor.name(), "markdown-extractor");
|
|
265
286
|
assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
|
|
266
|
-
assert_eq!(
|
|
287
|
+
assert_eq!(
|
|
288
|
+
extractor.supported_mime_types(),
|
|
289
|
+
&[
|
|
290
|
+
"text/markdown",
|
|
291
|
+
"text/x-markdown",
|
|
292
|
+
"text/x-markdown-extra",
|
|
293
|
+
"text/x-multimarkdown"
|
|
294
|
+
]
|
|
295
|
+
);
|
|
267
296
|
assert_eq!(extractor.priority(), 50);
|
|
268
297
|
}
|
|
269
298
|
}
|
|
@@ -96,7 +96,12 @@ impl DocumentExtractor for XmlExtractor {
|
|
|
96
96
|
}
|
|
97
97
|
|
|
98
98
|
fn supported_mime_types(&self) -> &[&str] {
|
|
99
|
-
&[
|
|
99
|
+
&[
|
|
100
|
+
"application/xml",
|
|
101
|
+
"text/xml",
|
|
102
|
+
"image/svg+xml",
|
|
103
|
+
"application/x-endnote+xml",
|
|
104
|
+
]
|
|
100
105
|
}
|
|
101
106
|
|
|
102
107
|
fn priority(&self) -> i32 {
|
|
@@ -142,7 +147,12 @@ mod tests {
|
|
|
142
147
|
assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
|
|
143
148
|
assert_eq!(
|
|
144
149
|
extractor.supported_mime_types(),
|
|
145
|
-
&[
|
|
150
|
+
&[
|
|
151
|
+
"application/xml",
|
|
152
|
+
"text/xml",
|
|
153
|
+
"image/svg+xml",
|
|
154
|
+
"application/x-endnote+xml"
|
|
155
|
+
]
|
|
146
156
|
);
|
|
147
157
|
assert_eq!(extractor.priority(), 50);
|
|
148
158
|
}
|
|
@@ -72,8 +72,21 @@ pub(super) fn perform_ocr(
|
|
|
72
72
|
)
|
|
73
73
|
});
|
|
74
74
|
|
|
75
|
-
let img =
|
|
76
|
-
|
|
75
|
+
let img = {
|
|
76
|
+
// Check for JPEG 2000 format which the image crate doesn't support
|
|
77
|
+
if crate::extraction::image::is_jp2(image_bytes) || crate::extraction::image::is_j2k(image_bytes) {
|
|
78
|
+
crate::extraction::image::decode_jp2_to_rgb(image_bytes)
|
|
79
|
+
.map(image::DynamicImage::ImageRgb8)
|
|
80
|
+
.map_err(|e| OcrError::ImageProcessingFailed(format!("Failed to decode JP2 image: {}", e)))?
|
|
81
|
+
} else if crate::extraction::image::is_jbig2(image_bytes) {
|
|
82
|
+
crate::extraction::image::decode_jbig2_to_gray(image_bytes)
|
|
83
|
+
.map(image::DynamicImage::ImageLuma8)
|
|
84
|
+
.map_err(|e| OcrError::ImageProcessingFailed(format!("Failed to decode JBIG2 image: {}", e)))?
|
|
85
|
+
} else {
|
|
86
|
+
image::load_from_memory(image_bytes)
|
|
87
|
+
.map_err(|e| OcrError::ImageProcessingFailed(format!("Failed to decode image: {}", e)))?
|
|
88
|
+
}
|
|
89
|
+
};
|
|
77
90
|
|
|
78
91
|
let rgb_image = img.to_rgb8();
|
|
79
92
|
let (width, height) = rgb_image.dimensions();
|
|
@@ -224,7 +237,7 @@ pub(super) fn perform_ocr(
|
|
|
224
237
|
"tsv" => {
|
|
225
238
|
let tsv = tsv_data_for_tables
|
|
226
239
|
.as_ref()
|
|
227
|
-
.
|
|
240
|
+
.ok_or_else(|| OcrError::ProcessingFailed("TSV data not available".to_string()))?
|
|
228
241
|
.clone();
|
|
229
242
|
(tsv, "text/plain".to_string())
|
|
230
243
|
}
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.2.12
|
|
4
|
+
version: 4.2.13
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-02-
|
|
11
|
+
date: 2026-02-07 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -422,6 +422,7 @@ files:
|
|
|
422
422
|
- vendor/kreuzberg/src/core/server_config/validation.rs
|
|
423
423
|
- vendor/kreuzberg/src/embeddings.rs
|
|
424
424
|
- vendor/kreuzberg/src/error.rs
|
|
425
|
+
- vendor/kreuzberg/src/extraction/archive/gzip.rs
|
|
425
426
|
- vendor/kreuzberg/src/extraction/archive/mod.rs
|
|
426
427
|
- vendor/kreuzberg/src/extraction/archive/sevenz.rs
|
|
427
428
|
- vendor/kreuzberg/src/extraction/archive/tar.rs
|
|
@@ -463,6 +464,7 @@ files:
|
|
|
463
464
|
- vendor/kreuzberg/src/extraction/xml.rs
|
|
464
465
|
- vendor/kreuzberg/src/extractors/archive.rs
|
|
465
466
|
- vendor/kreuzberg/src/extractors/bibtex.rs
|
|
467
|
+
- vendor/kreuzberg/src/extractors/citation.rs
|
|
466
468
|
- vendor/kreuzberg/src/extractors/djot_format/attributes.rs
|
|
467
469
|
- vendor/kreuzberg/src/extractors/djot_format/conversion.rs
|
|
468
470
|
- vendor/kreuzberg/src/extractors/djot_format/extractor.rs
|