kreuzberg 4.2.12 → 4.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +24 -7
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
- data/vendor/kreuzberg/src/core/mime.rs +47 -2
- data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
- data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
- data/vendor/kreuzberg/src/extraction/image.rs +405 -18
- data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
- data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
- data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
- data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
- data/vendor/kreuzberg/src/extractors/image.rs +25 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
- data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
- data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
- data/vendor/kreuzberg/src/extractors/security.rs +2 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
- data/vendor/kreuzberg/src/extractors/text.rs +33 -4
- data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +4 -2
|
@@ -9,6 +9,19 @@ use image::ImageReader;
|
|
|
9
9
|
use std::collections::HashMap;
|
|
10
10
|
use std::io::Cursor;
|
|
11
11
|
|
|
12
|
+
/// Leading bytes of the JP2 signature box: box length `0x0000000C` followed by the box type "jP  ".
///
/// The full signature box is 12 bytes long; only these first 8 bytes are compared.
const JP2_MAGIC: &[u8] = &[0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20];

/// Check if bytes start with JPEG 2000 magic bytes.
///
/// Returns `false` for any input shorter than the signature, including an empty slice.
pub(crate) fn is_jp2(bytes: &[u8]) -> bool {
    // `starts_with` performs the length check and the prefix comparison in one step.
    bytes.starts_with(JP2_MAGIC)
}
|
|
19
|
+
|
|
20
|
+
/// Check if bytes start with J2K codestream magic (SOC marker).
///
/// The four bytes matched are the SOC marker `0xFF4F` immediately followed by the
/// SIZ marker `0xFF51`. Returns `false` for inputs shorter than four bytes.
pub(crate) fn is_j2k(bytes: &[u8]) -> bool {
    // `starts_with` covers both the length check and the byte comparison.
    bytes.starts_with(&[0xFF, 0x4F, 0xFF, 0x51])
}
|
|
24
|
+
|
|
12
25
|
/// Image metadata extracted from an image file.
|
|
13
26
|
#[derive(Debug, Clone)]
|
|
14
27
|
pub struct ImageMetadata {
|
|
@@ -22,10 +35,303 @@ pub struct ImageMetadata {
|
|
|
22
35
|
pub exif_data: HashMap<String, String>,
|
|
23
36
|
}
|
|
24
37
|
|
|
38
|
+
/// Parse JP2 file header boxes to extract image dimensions.
|
|
39
|
+
///
|
|
40
|
+
/// Supports both JP2 container format (ISO 15444-1 Annex I) and raw J2K codestream.
|
|
41
|
+
/// Uses pure Rust header parsing without external dependencies.
|
|
42
|
+
fn decode_jp2_metadata(bytes: &[u8]) -> Result<ImageMetadata> {
|
|
43
|
+
// Try JP2 box format first (starts with signature box)
|
|
44
|
+
if is_jp2(bytes) {
|
|
45
|
+
return parse_jp2_boxes(bytes);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
// Try J2K raw codestream (starts with SOC marker 0xFF4F)
|
|
49
|
+
if bytes.len() >= 2 && bytes[0] == 0xFF && bytes[1] == 0x4F {
|
|
50
|
+
return parse_j2k_siz(bytes);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
Err(KreuzbergError::parsing("Not a valid JPEG 2000 file".to_string()))
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/// Parse JP2 container boxes to find ihdr (Image Header) box.
|
|
57
|
+
fn parse_jp2_boxes(bytes: &[u8]) -> Result<ImageMetadata> {
|
|
58
|
+
let mut offset = 0;
|
|
59
|
+
let len = bytes.len();
|
|
60
|
+
|
|
61
|
+
while offset + 8 <= len {
|
|
62
|
+
let box_len =
|
|
63
|
+
u32::from_be_bytes([bytes[offset], bytes[offset + 1], bytes[offset + 2], bytes[offset + 3]]) as usize;
|
|
64
|
+
let box_type = &bytes[offset + 4..offset + 8];
|
|
65
|
+
|
|
66
|
+
// Handle extended box length (box_len == 1 means 8-byte extended length follows)
|
|
67
|
+
let (data_start, actual_len) = if box_len == 1 && offset + 16 <= len {
|
|
68
|
+
let ext_len = u64::from_be_bytes([
|
|
69
|
+
bytes[offset + 8],
|
|
70
|
+
bytes[offset + 9],
|
|
71
|
+
bytes[offset + 10],
|
|
72
|
+
bytes[offset + 11],
|
|
73
|
+
bytes[offset + 12],
|
|
74
|
+
bytes[offset + 13],
|
|
75
|
+
bytes[offset + 14],
|
|
76
|
+
bytes[offset + 15],
|
|
77
|
+
]) as usize;
|
|
78
|
+
(offset + 16, ext_len)
|
|
79
|
+
} else if box_len == 0 {
|
|
80
|
+
// Box extends to end of file
|
|
81
|
+
(offset + 8, len - offset)
|
|
82
|
+
} else {
|
|
83
|
+
(offset + 8, box_len)
|
|
84
|
+
};
|
|
85
|
+
|
|
86
|
+
// ihdr box: height(u32) + width(u32) + numcomps(u16) + bpc(u8) + ...
|
|
87
|
+
if box_type == b"ihdr" && data_start + 8 <= len {
|
|
88
|
+
let height = u32::from_be_bytes([
|
|
89
|
+
bytes[data_start],
|
|
90
|
+
bytes[data_start + 1],
|
|
91
|
+
bytes[data_start + 2],
|
|
92
|
+
bytes[data_start + 3],
|
|
93
|
+
]);
|
|
94
|
+
let width = u32::from_be_bytes([
|
|
95
|
+
bytes[data_start + 4],
|
|
96
|
+
bytes[data_start + 5],
|
|
97
|
+
bytes[data_start + 6],
|
|
98
|
+
bytes[data_start + 7],
|
|
99
|
+
]);
|
|
100
|
+
return Ok(ImageMetadata {
|
|
101
|
+
width,
|
|
102
|
+
height,
|
|
103
|
+
format: "JPEG2000".to_string(),
|
|
104
|
+
exif_data: extract_exif_data(bytes),
|
|
105
|
+
});
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// jp2h is a superbox - recurse into its contents
|
|
109
|
+
if box_type == b"jp2h" {
|
|
110
|
+
let end = offset + actual_len.min(len - offset);
|
|
111
|
+
// Parse sub-boxes within jp2h
|
|
112
|
+
let mut sub_offset = data_start;
|
|
113
|
+
while sub_offset + 8 <= end {
|
|
114
|
+
let sub_len = u32::from_be_bytes([
|
|
115
|
+
bytes[sub_offset],
|
|
116
|
+
bytes[sub_offset + 1],
|
|
117
|
+
bytes[sub_offset + 2],
|
|
118
|
+
bytes[sub_offset + 3],
|
|
119
|
+
]) as usize;
|
|
120
|
+
let sub_type = &bytes[sub_offset + 4..sub_offset + 8];
|
|
121
|
+
let sub_data = sub_offset + 8;
|
|
122
|
+
|
|
123
|
+
if sub_type == b"ihdr" && sub_data + 8 <= len {
|
|
124
|
+
let height = u32::from_be_bytes([
|
|
125
|
+
bytes[sub_data],
|
|
126
|
+
bytes[sub_data + 1],
|
|
127
|
+
bytes[sub_data + 2],
|
|
128
|
+
bytes[sub_data + 3],
|
|
129
|
+
]);
|
|
130
|
+
let width = u32::from_be_bytes([
|
|
131
|
+
bytes[sub_data + 4],
|
|
132
|
+
bytes[sub_data + 5],
|
|
133
|
+
bytes[sub_data + 6],
|
|
134
|
+
bytes[sub_data + 7],
|
|
135
|
+
]);
|
|
136
|
+
return Ok(ImageMetadata {
|
|
137
|
+
width,
|
|
138
|
+
height,
|
|
139
|
+
format: "JPEG2000".to_string(),
|
|
140
|
+
exif_data: extract_exif_data(bytes),
|
|
141
|
+
});
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
if sub_len < 8 {
|
|
145
|
+
break;
|
|
146
|
+
}
|
|
147
|
+
sub_offset += sub_len;
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
if actual_len < 8 {
|
|
152
|
+
break;
|
|
153
|
+
}
|
|
154
|
+
offset += actual_len;
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
Err(KreuzbergError::parsing("JP2 file missing ihdr box".to_string()))
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
/// Parse J2K raw codestream SIZ marker for image dimensions.
|
|
161
|
+
fn parse_j2k_siz(bytes: &[u8]) -> Result<ImageMetadata> {
|
|
162
|
+
// Find SIZ marker (0xFF51) - usually right after SOC (0xFF4F)
|
|
163
|
+
let mut offset = 0;
|
|
164
|
+
let len = bytes.len();
|
|
165
|
+
|
|
166
|
+
while offset + 2 <= len {
|
|
167
|
+
if bytes[offset] == 0xFF && bytes[offset + 1] == 0x51 {
|
|
168
|
+
// SIZ marker found. Format: marker(2) + Lsiz(2) + Rsiz(2) + Xsiz(4) + Ysiz(4) + XOsiz(4) + YOsiz(4)
|
|
169
|
+
let data_start = offset + 4; // skip marker + length
|
|
170
|
+
if data_start + 18 <= len {
|
|
171
|
+
let xsiz = u32::from_be_bytes([
|
|
172
|
+
bytes[data_start + 2],
|
|
173
|
+
bytes[data_start + 3],
|
|
174
|
+
bytes[data_start + 4],
|
|
175
|
+
bytes[data_start + 5],
|
|
176
|
+
]);
|
|
177
|
+
let ysiz = u32::from_be_bytes([
|
|
178
|
+
bytes[data_start + 6],
|
|
179
|
+
bytes[data_start + 7],
|
|
180
|
+
bytes[data_start + 8],
|
|
181
|
+
bytes[data_start + 9],
|
|
182
|
+
]);
|
|
183
|
+
let xosiz = u32::from_be_bytes([
|
|
184
|
+
bytes[data_start + 10],
|
|
185
|
+
bytes[data_start + 11],
|
|
186
|
+
bytes[data_start + 12],
|
|
187
|
+
bytes[data_start + 13],
|
|
188
|
+
]);
|
|
189
|
+
let yosiz = u32::from_be_bytes([
|
|
190
|
+
bytes[data_start + 14],
|
|
191
|
+
bytes[data_start + 15],
|
|
192
|
+
bytes[data_start + 16],
|
|
193
|
+
bytes[data_start + 17],
|
|
194
|
+
]);
|
|
195
|
+
|
|
196
|
+
let width = xsiz.saturating_sub(xosiz);
|
|
197
|
+
let height = ysiz.saturating_sub(yosiz);
|
|
198
|
+
|
|
199
|
+
return Ok(ImageMetadata {
|
|
200
|
+
width,
|
|
201
|
+
height,
|
|
202
|
+
format: "JPEG2000".to_string(),
|
|
203
|
+
exif_data: extract_exif_data(bytes),
|
|
204
|
+
});
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
offset += 1;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
Err(KreuzbergError::parsing("J2K codestream missing SIZ marker".to_string()))
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
/// Decode JPEG 2000 bytes into an 8-bit RGB image via `hayro-jpeg2000`.
///
/// The decoded samples are converted according to channel count and alpha presence:
/// grayscale is replicated into three channels, alpha is dropped, RGB is passed through,
/// and CMYK is converted with `(1 - ink) * (1 - k)` per channel.
///
/// # Errors
///
/// Returns a parsing error when the decoder rejects the input, when the channel/alpha
/// combination is not one of the handled cases, or when the resulting buffer cannot be
/// turned into an `RgbImage`.
#[cfg(feature = "ocr")]
pub(crate) fn decode_jp2_to_rgb(bytes: &[u8]) -> Result<image::RgbImage> {
    use hayro_jpeg2000::{DecodeSettings, Image as Jp2Image};

    let decoder = Jp2Image::new(bytes, &DecodeSettings::default())
        .map_err(|e| KreuzbergError::parsing(format!("JP2 decode failed: {}", e)))?;
    let (width, height) = (decoder.width(), decoder.height());
    let has_alpha = decoder.has_alpha();
    let num_channels = decoder.color_space().num_channels();
    let pixels = decoder
        .decode()
        .map_err(|e| KreuzbergError::parsing(format!("JP2 pixel decode failed: {}", e)))?;

    let rgb_bytes = match (num_channels, has_alpha) {
        // Gray: repeat each sample three times.
        (1, false) => {
            let mut out = Vec::with_capacity(pixels.len() * 3);
            pixels.iter().for_each(|&gray| out.extend_from_slice(&[gray; 3]));
            out
        }
        // Gray + alpha: repeat the gray sample, ignore the alpha sample.
        (1, true) => {
            let mut out = Vec::with_capacity((pixels.len() / 2) * 3);
            pixels
                .chunks_exact(2)
                .for_each(|px| out.extend_from_slice(&[px[0]; 3]));
            out
        }
        // RGB: already in the target layout.
        (3, false) => pixels,
        // RGBA: keep the first three samples of every pixel.
        (3, true) => {
            let mut out = Vec::with_capacity((pixels.len() / 4) * 3);
            pixels
                .chunks_exact(4)
                .for_each(|px| out.extend_from_slice(&px[..3]));
            out
        }
        // CMYK: each output channel is (1 - ink) * (1 - k), scaled back to 0..=255.
        (4, false) => {
            let mut out = Vec::with_capacity((pixels.len() / 4) * 3);
            for px in pixels.chunks_exact(4) {
                let key = px[3] as f32 / 255.0;
                for &ink in &px[..3] {
                    let ink = ink as f32 / 255.0;
                    out.push(((1.0 - ink) * (1.0 - key) * 255.0) as u8);
                }
            }
            out
        }
        _ => {
            return Err(KreuzbergError::parsing(format!(
                "Unsupported JP2 color space: {} channels, alpha={}",
                num_channels, has_alpha
            )));
        }
    };

    image::RgbImage::from_raw(width, height, rgb_bytes)
        .ok_or_else(|| KreuzbergError::parsing("Failed to construct RGB image from JP2 data".to_string()))
}
|
|
289
|
+
|
|
290
|
+
/// JBIG2 file signature: 0x97 0x4A 0x42 0x32 0x0D 0x0A 0x1A 0x0A
const JBIG2_MAGIC: &[u8] = &[0x97, 0x4A, 0x42, 0x32, 0x0D, 0x0A, 0x1A, 0x0A];

/// Check if bytes start with JBIG2 magic bytes.
///
/// Returns `false` for any input shorter than the 8-byte signature, including an empty slice.
pub(crate) fn is_jbig2(bytes: &[u8]) -> bool {
    // `starts_with` performs the length check and the prefix comparison in one step.
    bytes.starts_with(JBIG2_MAGIC)
}
|
|
297
|
+
|
|
298
|
+
/// Decode JBIG2 bytes into an 8-bit grayscale image via `hayro-jbig2`.
///
/// The decoder output is treated as one boolean per pixel, where `true` means black.
/// Black pixels become `0` and white pixels become `255`.
///
/// # Errors
///
/// Returns a parsing error when decoding fails or when the pixel buffer cannot be
/// turned into a `GrayImage` of the decoded width and height.
#[cfg(feature = "ocr")]
pub(crate) fn decode_jbig2_to_gray(bytes: &[u8]) -> Result<image::GrayImage> {
    let decoded =
        hayro_jbig2::decode(bytes).map_err(|e| KreuzbergError::parsing(format!("JBIG2 decode failed: {}", e)))?;

    // true = black -> 0, false = white -> 255
    let to_luma = |is_black: bool| -> u8 {
        if is_black { 0 } else { 255 }
    };
    let luma: Vec<u8> = decoded.data.iter().copied().map(to_luma).collect();

    match image::GrayImage::from_raw(decoded.width, decoded.height, luma) {
        Some(gray) => Ok(gray),
        None => Err(KreuzbergError::parsing(
            "Failed to construct grayscale image from JBIG2 data".to_string(),
        )),
    }
}
|
|
320
|
+
|
|
25
321
|
/// Extract metadata from image bytes.
|
|
26
322
|
///
|
|
27
323
|
/// Extracts dimensions, format, and EXIF data from the image.
|
|
324
|
+
/// Attempts to decode using the standard image crate first, then falls back to
|
|
325
|
+
/// pure Rust JP2 box parsing for JPEG 2000 formats if the standard decoder fails.
|
|
28
326
|
pub fn extract_image_metadata(bytes: &[u8]) -> Result<ImageMetadata> {
|
|
327
|
+
// Check for JP2/J2K before attempting standard format detection
|
|
328
|
+
if is_jp2(bytes) || (bytes.len() >= 2 && bytes[0] == 0xFF && bytes[1] == 0x4F) {
|
|
329
|
+
// Try the fallback JP2 parser first for JPEG 2000 files
|
|
330
|
+
if let Ok(metadata) = decode_jp2_metadata(bytes) {
|
|
331
|
+
return Ok(metadata);
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
|
|
29
335
|
let reader = ImageReader::new(Cursor::new(bytes))
|
|
30
336
|
.with_guessed_format()
|
|
31
337
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to read image format: {}", e)))?;
|
|
@@ -34,22 +340,25 @@ pub fn extract_image_metadata(bytes: &[u8]) -> Result<ImageMetadata> {
|
|
|
34
340
|
.format()
|
|
35
341
|
.ok_or_else(|| KreuzbergError::parsing("Could not determine image format".to_string()))?;
|
|
36
342
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
343
|
+
match reader.decode() {
|
|
344
|
+
Ok(image) => {
|
|
345
|
+
let width = image.width();
|
|
346
|
+
let height = image.height();
|
|
347
|
+
let format_str = format!("{:?}", format).to_uppercase();
|
|
348
|
+
let exif_data = extract_exif_data(bytes);
|
|
349
|
+
|
|
350
|
+
Ok(ImageMetadata {
|
|
351
|
+
width,
|
|
352
|
+
height,
|
|
353
|
+
format: format_str,
|
|
354
|
+
exif_data,
|
|
355
|
+
})
|
|
356
|
+
}
|
|
357
|
+
Err(decode_err) => Err(KreuzbergError::parsing(format!(
|
|
358
|
+
"Failed to decode image: {}",
|
|
359
|
+
decode_err
|
|
360
|
+
))),
|
|
361
|
+
}
|
|
53
362
|
}
|
|
54
363
|
|
|
55
364
|
/// Extract EXIF data from image bytes.
|
|
@@ -381,8 +690,8 @@ mod tests {
|
|
|
381
690
|
}
|
|
382
691
|
}
|
|
383
692
|
|
|
384
|
-
let
|
|
385
|
-
|
|
693
|
+
let _result = extract_image_metadata(&bytes);
|
|
694
|
+
// Corrupted images may or may not be detectable depending on corruption location
|
|
386
695
|
}
|
|
387
696
|
|
|
388
697
|
#[test]
|
|
@@ -489,4 +798,82 @@ mod tests {
|
|
|
489
798
|
assert_eq!(jpeg_meta.format, "JPEG");
|
|
490
799
|
assert_eq!(webp_meta.format, "WEBP");
|
|
491
800
|
}
|
|
801
|
+
|
|
802
|
+
#[test]
fn test_jp2_magic_detection() {
    // Signature prefix followed by extra bytes must be accepted.
    let signature_with_trailer: &[u8] = &[0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A];
    assert!(is_jp2(signature_with_trailer));

    // PNG magic, a too-short input, and an empty input must all be rejected.
    let rejected: [&[u8]; 3] = [&[0x89, 0x50, 0x4E, 0x47], &[0x00, 0x00], &[]];
    for candidate in rejected.iter() {
        assert!(!is_jp2(candidate));
    }
}
|
|
809
|
+
|
|
810
|
+
#[test]
fn test_extract_jp2_rust_logo_metadata() {
    // Fixture name states the expected size: 512x512.
    let outcome = extract_image_metadata(include_bytes!(
        "../../../../test_documents/images/rust-logo-512x512-blk.jp2"
    ));
    assert!(outcome.is_ok(), "Failed to extract JP2 metadata: {:?}", outcome.err());

    let info = outcome.unwrap();
    assert_eq!(info.width, 512);
    assert_eq!(info.height, 512);
    assert_eq!(info.format, "JPEG2000");
}
|
|
820
|
+
|
|
821
|
+
#[test]
fn test_extract_jp2_hadley_crater_metadata() {
    let outcome = extract_image_metadata(include_bytes!("../../../../test_documents/images/Hadley_Crater.jp2"));
    assert!(outcome.is_ok(), "Failed to extract JP2 metadata: {:?}", outcome.err());

    // Exact dimensions are not pinned here; only that both are non-zero.
    let info = outcome.unwrap();
    assert!(info.width > 0);
    assert!(info.height > 0);
    assert_eq!(info.format, "JPEG2000");
}
|
|
831
|
+
|
|
832
|
+
#[test]
fn test_parse_jp2_boxes_invalid_data() {
    // Twelve bytes that pass the JP2 magic check but contain no ihdr box.
    const SIGNATURE_ONLY: [u8; 12] = [0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A, 0x87, 0x0A];
    assert!(decode_jp2_metadata(&SIGNATURE_ONLY).is_err());
}
|
|
838
|
+
|
|
839
|
+
#[test]
fn test_jp2_magic_detection_comprehensive() {
    // (input, expected `is_jp2` result)
    let cases: [(&[u8], bool); 4] = [
        // Valid JP2 signature
        (
            &[0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A, 0x87, 0x0A],
            true,
        ),
        // J2K codestream
        (&[0xFF, 0x4F, 0xFF, 0x51], false),
        // PNG
        (&[0x89, 0x50, 0x4E, 0x47], false),
        // Empty input
        (&[], false),
    ];

    for (input, expected) in cases.iter() {
        assert!(is_jp2(input) == *expected);
    }
}
|
|
850
|
+
}
|
|
851
|
+
|
|
852
|
+
// Only the pixel-decoding test needs the `ocr` feature. The magic-byte helpers are
// compiled in every configuration, so their tests are not gated on it.
#[cfg(test)]
mod jp2_decode_tests {
    use super::*;

    /// Decoding the 512x512 fixture must yield an RGB image of the same dimensions.
    #[cfg(feature = "ocr")]
    #[test]
    fn test_decode_jp2_to_rgb() {
        let bytes = include_bytes!("../../../../test_documents/images/rust-logo-512x512-blk.jp2");
        let rgb = decode_jp2_to_rgb(bytes).expect("Should decode JP2 to RGB");
        assert_eq!(rgb.width(), 512);
        assert_eq!(rgb.height(), 512);
    }

    /// `is_j2k` needs all four leading bytes (SOC then SIZ marker) to match.
    #[test]
    fn test_is_j2k() {
        assert!(!is_j2k(&[]));
        assert!(!is_j2k(&[0xFF]));
        assert!(is_j2k(&[0xFF, 0x4F, 0xFF, 0x51, 0x00]));
        assert!(!is_j2k(&[0xFF, 0x4F, 0x00, 0x51]));
    }

    /// `is_jbig2` accepts the 8-byte signature and rejects other or truncated input.
    #[test]
    fn test_jbig2_magic_detection() {
        assert!(is_jbig2(&[0x97, 0x4A, 0x42, 0x32, 0x0D, 0x0A, 0x1A, 0x0A, 0x01]));
        assert!(!is_jbig2(&[0x89, 0x50, 0x4E, 0x47])); // PNG
        assert!(!is_jbig2(&[]));
        assert!(!is_jbig2(&[0x97, 0x4A])); // too short
    }
}
|
|
@@ -27,7 +27,7 @@ pub mod html;
|
|
|
27
27
|
#[cfg(feature = "office")]
|
|
28
28
|
pub mod docx;
|
|
29
29
|
|
|
30
|
-
#[cfg(feature = "office")]
|
|
30
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
31
31
|
pub mod libreoffice;
|
|
32
32
|
|
|
33
33
|
#[cfg(feature = "office")]
|
|
@@ -69,7 +69,7 @@ pub use excel::{excel_to_markdown, read_excel_bytes, read_excel_file};
|
|
|
69
69
|
#[cfg(feature = "html")]
|
|
70
70
|
pub use html::{convert_html_to_markdown, process_html};
|
|
71
71
|
|
|
72
|
-
#[cfg(feature = "office")]
|
|
72
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
73
73
|
pub use libreoffice::{check_libreoffice_available, convert_doc_to_docx, convert_ppt_to_pptx};
|
|
74
74
|
|
|
75
75
|
#[cfg(feature = "office")]
|