kreuzberg 4.0.0.pre.rc.20 → 4.0.0.pre.rc.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/kreuzberg.gemspec +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/src/core/config.rs +3 -0
- data/vendor/kreuzberg/src/extractors/epub.rs +20 -32
- data/vendor/kreuzberg/src/extractors/pdf.rs +8 -8
- data/vendor/kreuzberg/src/pdf/bindings.rs +14 -13
- data/vendor/kreuzberg/src/pdf/error.rs +89 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +2 -4
- data/vendor/kreuzberg/src/pdf/rendering.rs +9 -6
- data/vendor/kreuzberg/src/pdf/text.rs +3 -5
- data/vendor/kreuzberg-ffi/build.rs +11 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bc17da6af86a2e71001e5e5844c764fbabdc29e1b777b368d179fd505f8440f1
|
|
4
|
+
data.tar.gz: eae32510b4c628b8aa73dd943fc0adf60c316efc2e20d2910e84a0f60bf9da24
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 869652660b57b61427d58ef01d832efc6854aa20c232db1c5fe0d1ce8601a70368c675d4b904be18d86f5b8d8b4730dbbfb050744457b03cf69063060c7521e8
|
|
7
|
+
data.tar.gz: 0c534272a1ddf5427221fde1917ab682b86adc7c1773d49865612240134b7576959f2bb2e4101fd00a5193016e6e141e0d01d1d29cfd82e4e1f25d057185be8a
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.0.0.pre.rc.20)
|
|
4
|
+
kreuzberg (4.0.0.pre.rc.21)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -66,7 +66,7 @@ GEM
|
|
|
66
66
|
rb-fsevent (0.11.2)
|
|
67
67
|
rb-inotify (0.11.1)
|
|
68
68
|
ffi (~> 1.0)
|
|
69
|
-
rb_sys (0.9.
|
|
69
|
+
rb_sys (0.9.119)
|
|
70
70
|
rake-compiler-dock (= 1.10.0)
|
|
71
71
|
rbs (3.10.0)
|
|
72
72
|
logger
|
|
@@ -147,7 +147,7 @@ DEPENDENCIES
|
|
|
147
147
|
pry-byebug (~> 3.10)
|
|
148
148
|
rake (~> 13.0)
|
|
149
149
|
rake-compiler (~> 1.2)
|
|
150
|
-
rb_sys (
|
|
150
|
+
rb_sys (= 0.9.119)
|
|
151
151
|
rbs (~> 3.0)
|
|
152
152
|
rspec (~> 3.12)
|
|
153
153
|
rubocop (~> 1.66)
|
data/kreuzberg.gemspec
CHANGED
|
@@ -201,7 +201,7 @@ Gem::Specification.new do |spec|
|
|
|
201
201
|
spec.add_development_dependency 'bundler', '~> 4.0'
|
|
202
202
|
spec.add_development_dependency 'rake', '~> 13.0'
|
|
203
203
|
spec.add_development_dependency 'rake-compiler', '~> 1.2'
|
|
204
|
-
spec.add_development_dependency 'rb_sys', '
|
|
204
|
+
spec.add_development_dependency 'rb_sys', '0.9.119'
|
|
205
205
|
spec.add_development_dependency 'rspec', '~> 3.12'
|
|
206
206
|
unless Gem.win_platform?
|
|
207
207
|
spec.add_development_dependency 'rbs', '~> 3.0'
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1254,6 +1254,9 @@ enable_quality_processing = true
|
|
|
1254
1254
|
let config1 = ExtractionConfig::from_toml_file(&config_path).unwrap();
|
|
1255
1255
|
assert!(!config1.use_cache);
|
|
1256
1256
|
|
|
1257
|
+
// Sleep to ensure mtime changes (some filesystems have 1-second granularity)
|
|
1258
|
+
std::thread::sleep(std::time::Duration::from_secs(1));
|
|
1259
|
+
|
|
1257
1260
|
fs::write(
|
|
1258
1261
|
&config_path,
|
|
1259
1262
|
r#"
|
|
@@ -293,49 +293,32 @@ impl EpubExtractor {
|
|
|
293
293
|
}
|
|
294
294
|
|
|
295
295
|
/// Extract metadata from EPUB OPF file
|
|
296
|
-
fn extract_metadata(opf_xml: &str) -> Result<BTreeMap<String, serde_json::Value>> {
|
|
297
|
-
let mut metadata = BTreeMap::new();
|
|
296
|
+
fn extract_metadata(opf_xml: &str) -> Result<(OepbMetadata, BTreeMap<String, serde_json::Value>)> {
|
|
297
|
+
let mut additional_metadata = BTreeMap::new();
|
|
298
298
|
|
|
299
299
|
let (epub_metadata, _) = Self::parse_opf(opf_xml)?;
|
|
300
300
|
|
|
301
|
-
if let Some(
|
|
302
|
-
|
|
301
|
+
if let Some(identifier) = epub_metadata.identifier.clone() {
|
|
302
|
+
additional_metadata.insert("identifier".to_string(), serde_json::json!(identifier));
|
|
303
303
|
}
|
|
304
304
|
|
|
305
|
-
if let Some(
|
|
306
|
-
|
|
307
|
-
metadata.insert("authors".to_string(), serde_json::json!(vec![creator]));
|
|
305
|
+
if let Some(publisher) = epub_metadata.publisher.clone() {
|
|
306
|
+
additional_metadata.insert("publisher".to_string(), serde_json::json!(publisher));
|
|
308
307
|
}
|
|
309
308
|
|
|
310
|
-
if let Some(
|
|
311
|
-
|
|
309
|
+
if let Some(subject) = epub_metadata.subject.clone() {
|
|
310
|
+
additional_metadata.insert("subject".to_string(), serde_json::json!(subject));
|
|
312
311
|
}
|
|
313
312
|
|
|
314
|
-
if let Some(
|
|
315
|
-
|
|
313
|
+
if let Some(description) = epub_metadata.description.clone() {
|
|
314
|
+
additional_metadata.insert("description".to_string(), serde_json::json!(description));
|
|
316
315
|
}
|
|
317
316
|
|
|
318
|
-
if let Some(
|
|
319
|
-
|
|
317
|
+
if let Some(rights) = epub_metadata.rights.clone() {
|
|
318
|
+
additional_metadata.insert("rights".to_string(), serde_json::json!(rights));
|
|
320
319
|
}
|
|
321
320
|
|
|
322
|
-
|
|
323
|
-
metadata.insert("publisher".to_string(), serde_json::json!(publisher));
|
|
324
|
-
}
|
|
325
|
-
|
|
326
|
-
if let Some(subject) = epub_metadata.subject {
|
|
327
|
-
metadata.insert("subject".to_string(), serde_json::json!(subject));
|
|
328
|
-
}
|
|
329
|
-
|
|
330
|
-
if let Some(description) = epub_metadata.description {
|
|
331
|
-
metadata.insert("description".to_string(), serde_json::json!(description));
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
if let Some(rights) = epub_metadata.rights {
|
|
335
|
-
metadata.insert("rights".to_string(), serde_json::json!(rights));
|
|
336
|
-
}
|
|
337
|
-
|
|
338
|
-
Ok(metadata)
|
|
321
|
+
Ok((epub_metadata, additional_metadata))
|
|
339
322
|
}
|
|
340
323
|
|
|
341
324
|
/// Parse container.xml to find the OPF file path
|
|
@@ -564,13 +547,18 @@ impl DocumentExtractor for EpubExtractor {
|
|
|
564
547
|
|
|
565
548
|
let extracted_content = Self::extract_content(&mut archive, &opf_path, &manifest_dir)?;
|
|
566
549
|
|
|
567
|
-
let
|
|
568
|
-
let metadata_map: std::collections::HashMap<String, serde_json::Value> =
|
|
550
|
+
let (epub_metadata, additional_metadata) = Self::extract_metadata(&opf_xml)?;
|
|
551
|
+
let metadata_map: std::collections::HashMap<String, serde_json::Value> =
|
|
552
|
+
additional_metadata.into_iter().collect();
|
|
569
553
|
|
|
570
554
|
Ok(ExtractionResult {
|
|
571
555
|
content: extracted_content,
|
|
572
556
|
mime_type: mime_type.to_string(),
|
|
573
557
|
metadata: Metadata {
|
|
558
|
+
title: epub_metadata.title,
|
|
559
|
+
authors: epub_metadata.creator.map(|c| vec![c]),
|
|
560
|
+
language: epub_metadata.language,
|
|
561
|
+
created_at: epub_metadata.date,
|
|
574
562
|
additional: metadata_map,
|
|
575
563
|
..Default::default()
|
|
576
564
|
},
|
|
@@ -408,10 +408,10 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
408
408
|
}
|
|
409
409
|
},
|
|
410
410
|
)?;
|
|
411
|
-
let pdfium = Pdfium
|
|
411
|
+
let pdfium = Pdfium;
|
|
412
412
|
|
|
413
413
|
let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
|
|
414
|
-
let err_msg = e
|
|
414
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
415
415
|
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
416
416
|
PdfError::PasswordRequired
|
|
417
417
|
} else {
|
|
@@ -433,10 +433,10 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
433
433
|
|
|
434
434
|
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
435
435
|
|
|
436
|
-
let pdfium = Pdfium
|
|
436
|
+
let pdfium = Pdfium;
|
|
437
437
|
|
|
438
438
|
let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
|
|
439
|
-
let err_msg = e
|
|
439
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
440
440
|
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
441
441
|
PdfError::PasswordRequired
|
|
442
442
|
} else {
|
|
@@ -465,10 +465,10 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
465
465
|
} else {
|
|
466
466
|
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
467
467
|
|
|
468
|
-
let pdfium = Pdfium
|
|
468
|
+
let pdfium = Pdfium;
|
|
469
469
|
|
|
470
470
|
let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
|
|
471
|
-
let err_msg = e
|
|
471
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
472
472
|
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
473
473
|
PdfError::PasswordRequired
|
|
474
474
|
} else {
|
|
@@ -484,10 +484,10 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
484
484
|
{
|
|
485
485
|
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
486
486
|
|
|
487
|
-
let pdfium = Pdfium
|
|
487
|
+
let pdfium = Pdfium;
|
|
488
488
|
|
|
489
489
|
let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
|
|
490
|
-
let err_msg = e
|
|
490
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
491
491
|
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
492
492
|
PdfError::PasswordRequired
|
|
493
493
|
} else {
|
|
@@ -82,7 +82,7 @@ fn bind_pdfium_impl() -> Result<(Option<PathBuf>, Box<dyn PdfiumLibraryBindings>
|
|
|
82
82
|
/// Instead of failing permanently, we recover by extracting the inner value from the
|
|
83
83
|
/// poisoned lock and proceeding. This ensures PDF extraction can continue even if an
|
|
84
84
|
/// earlier panic occurred, as long as the state is consistent.
|
|
85
|
-
pub(crate) fn bind_pdfium(map_err: fn(String) -> PdfError, context: &'static str) -> Result<(), PdfError> {
|
|
85
|
+
pub(crate) fn bind_pdfium(map_err: fn(String) -> PdfError, context: &'static str) -> Result<Pdfium, PdfError> {
|
|
86
86
|
let mut state = PDFIUM_STATE.lock().unwrap_or_else(|poisoned| {
|
|
87
87
|
// SAFETY: Recovering from a poisoned lock is safe here because:
|
|
88
88
|
// 1. The poisoned state still contains valid data (just a guard from a panicked thread)
|
|
@@ -95,27 +95,25 @@ pub(crate) fn bind_pdfium(map_err: fn(String) -> PdfError, context: &'static str
|
|
|
95
95
|
match &*state {
|
|
96
96
|
InitializationState::Uninitialized => match bind_pdfium_impl() {
|
|
97
97
|
Ok((lib_dir, bindings)) => {
|
|
98
|
-
// Initialize Pdfium singleton with the bindings
|
|
99
|
-
let
|
|
98
|
+
// Initialize Pdfium singleton with the bindings and return it
|
|
99
|
+
let pdfium = Pdfium::new(bindings);
|
|
100
100
|
*state = InitializationState::Initialized { lib_dir };
|
|
101
|
+
Ok(pdfium)
|
|
101
102
|
}
|
|
102
103
|
Err(err) => {
|
|
103
104
|
*state = InitializationState::Failed(err.clone());
|
|
104
|
-
|
|
105
|
+
Err(map_err(format!("Pdfium initialization failed ({}): {}", context, err)))
|
|
105
106
|
}
|
|
106
107
|
},
|
|
107
|
-
InitializationState::Failed(err) =>
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
)));
|
|
112
|
-
}
|
|
108
|
+
InitializationState::Failed(err) => Err(map_err(format!(
|
|
109
|
+
"Pdfium initialization previously failed ({}): {}",
|
|
110
|
+
context, err
|
|
111
|
+
))),
|
|
113
112
|
InitializationState::Initialized { .. } => {
|
|
114
|
-
// Already initialized,
|
|
113
|
+
// Already initialized, return a new accessor to the singleton
|
|
114
|
+
Ok(Pdfium)
|
|
115
115
|
}
|
|
116
116
|
}
|
|
117
|
-
|
|
118
|
-
Ok(())
|
|
119
117
|
}
|
|
120
118
|
|
|
121
119
|
#[cfg(test)]
|
|
@@ -128,6 +126,9 @@ mod tests {
|
|
|
128
126
|
// First call should initialize
|
|
129
127
|
let result = bind_pdfium(PdfError::TextExtractionFailed, "test context");
|
|
130
128
|
assert!(result.is_ok(), "First bind_pdfium call should succeed");
|
|
129
|
+
// Verify the returned Pdfium instance is usable
|
|
130
|
+
let pdfium = result.unwrap();
|
|
131
|
+
assert!(pdfium.is_pdfium_ready(), "Pdfium should be initialized");
|
|
131
132
|
}
|
|
132
133
|
|
|
133
134
|
#[test]
|
|
@@ -52,6 +52,34 @@ impl From<lopdf::Error> for PdfError {
|
|
|
52
52
|
|
|
53
53
|
pub type Result<T> = std::result::Result<T, PdfError>;
|
|
54
54
|
|
|
55
|
+
/// Format a pdfium error for display.
|
|
56
|
+
///
|
|
57
|
+
/// The kreuzberg-pdfium-render fork's error type doesn't implement Display,
|
|
58
|
+
/// so Debug formatting produces messages like "PdfiumLibraryInternalError(FormatError,)"
|
|
59
|
+
/// with trailing commas and parentheses. This function cleans up the formatting.
|
|
60
|
+
pub(crate) fn format_pdfium_error<E: std::fmt::Debug>(error: E) -> String {
|
|
61
|
+
let debug_msg = format!("{:?}", error);
|
|
62
|
+
|
|
63
|
+
// Extract the variant name and clean up Debug formatting
|
|
64
|
+
// "PdfiumLibraryInternalError(FormatError,)" -> "PdfiumLibraryInternalError: FormatError"
|
|
65
|
+
// "SomeError" -> "SomeError"
|
|
66
|
+
if let Some(paren_idx) = debug_msg.find('(') {
|
|
67
|
+
let variant = &debug_msg[..paren_idx];
|
|
68
|
+
let inner = &debug_msg[paren_idx + 1..];
|
|
69
|
+
|
|
70
|
+
// Remove trailing ",)" or ")"
|
|
71
|
+
let inner_clean = inner.trim_end_matches(')').trim_end_matches(',');
|
|
72
|
+
|
|
73
|
+
if inner_clean.is_empty() {
|
|
74
|
+
variant.to_string()
|
|
75
|
+
} else {
|
|
76
|
+
format!("{}: {}", variant, inner_clean)
|
|
77
|
+
}
|
|
78
|
+
} else {
|
|
79
|
+
debug_msg
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
55
83
|
#[cfg(test)]
|
|
56
84
|
mod tests {
|
|
57
85
|
use super::*;
|
|
@@ -135,4 +163,65 @@ mod tests {
|
|
|
135
163
|
let err = PdfError::FontLoadingFailed("missing font file".to_string());
|
|
136
164
|
assert_eq!(err.to_string(), "Font loading failed: missing font file");
|
|
137
165
|
}
|
|
166
|
+
|
|
167
|
+
#[test]
|
|
168
|
+
fn test_format_pdfium_error_with_inner_value() {
|
|
169
|
+
// Simulate pdfium error: "PdfiumLibraryInternalError(FormatError,)"
|
|
170
|
+
#[derive(Debug)]
|
|
171
|
+
#[allow(dead_code)]
|
|
172
|
+
struct MockError(String);
|
|
173
|
+
|
|
174
|
+
let error = MockError("FormatError,".to_string());
|
|
175
|
+
let formatted = format_pdfium_error(error);
|
|
176
|
+
// Should clean up the trailing comma
|
|
177
|
+
assert!(formatted.contains("MockError"));
|
|
178
|
+
assert!(formatted.contains("FormatError"));
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
#[test]
|
|
182
|
+
fn test_format_pdfium_error_simple() {
|
|
183
|
+
// Simulate simple error without parentheses
|
|
184
|
+
#[derive(Debug)]
|
|
185
|
+
struct SimpleError;
|
|
186
|
+
|
|
187
|
+
let formatted = format_pdfium_error(SimpleError);
|
|
188
|
+
assert_eq!(formatted, "SimpleError");
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
#[test]
|
|
192
|
+
fn test_format_pdfium_error_empty_inner() {
|
|
193
|
+
// Simulate error with empty inner: "SomeError()"
|
|
194
|
+
#[derive(Debug)]
|
|
195
|
+
struct EmptyInner;
|
|
196
|
+
|
|
197
|
+
let formatted = format_pdfium_error(EmptyInner);
|
|
198
|
+
// Will be "EmptyInner" since the formatting doesn't add parentheses
|
|
199
|
+
assert_eq!(formatted, "EmptyInner");
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
#[test]
|
|
203
|
+
fn test_format_pdfium_error_cleans_trailing_comma() {
|
|
204
|
+
// This test simulates the actual pdfium error format
|
|
205
|
+
// "PdfiumLibraryInternalError(FormatError,)" should become
|
|
206
|
+
// "PdfiumLibraryInternalError: FormatError"
|
|
207
|
+
#[derive(Debug)]
|
|
208
|
+
#[allow(dead_code)]
|
|
209
|
+
enum PdfiumError {
|
|
210
|
+
PdfiumLibraryInternalError(InternalError),
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
#[derive(Debug)]
|
|
214
|
+
#[allow(dead_code)]
|
|
215
|
+
enum InternalError {
|
|
216
|
+
FormatError,
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
let error = PdfiumError::PdfiumLibraryInternalError(InternalError::FormatError);
|
|
220
|
+
let formatted = format_pdfium_error(error);
|
|
221
|
+
|
|
222
|
+
// Should not contain trailing comma or redundant parentheses
|
|
223
|
+
assert!(!formatted.contains(",)"));
|
|
224
|
+
assert!(formatted.contains("PdfiumLibraryInternalError"));
|
|
225
|
+
assert!(formatted.contains("FormatError"));
|
|
226
|
+
}
|
|
138
227
|
}
|
|
@@ -86,12 +86,10 @@ pub fn extract_metadata(pdf_bytes: &[u8]) -> Result<PdfMetadata> {
|
|
|
86
86
|
///
|
|
87
87
|
/// Returns only PDF-specific metadata (version, producer, encryption status, dimensions).
|
|
88
88
|
pub fn extract_metadata_with_password(pdf_bytes: &[u8], password: Option<&str>) -> Result<PdfMetadata> {
|
|
89
|
-
bind_pdfium(PdfError::MetadataExtractionFailed, "metadata extraction")?;
|
|
90
|
-
|
|
91
|
-
let pdfium = Pdfium {};
|
|
89
|
+
let pdfium = bind_pdfium(PdfError::MetadataExtractionFailed, "metadata extraction")?;
|
|
92
90
|
|
|
93
91
|
let document = pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
|
|
94
|
-
let err_msg = e
|
|
92
|
+
let err_msg = super::error::format_pdfium_error(e);
|
|
95
93
|
if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
|
|
96
94
|
PdfError::InvalidPassword
|
|
97
95
|
} else if err_msg.contains("password") || err_msg.contains("Password") {
|
|
@@ -33,9 +33,7 @@ pub struct PdfRenderer {
|
|
|
33
33
|
|
|
34
34
|
impl PdfRenderer {
|
|
35
35
|
pub fn new() -> Result<Self> {
|
|
36
|
-
bind_pdfium(PdfError::RenderingFailed, "page rendering")?;
|
|
37
|
-
|
|
38
|
-
let pdfium = Pdfium {};
|
|
36
|
+
let pdfium = bind_pdfium(PdfError::RenderingFailed, "page rendering")?;
|
|
39
37
|
Ok(Self { pdfium })
|
|
40
38
|
}
|
|
41
39
|
|
|
@@ -56,7 +54,7 @@ impl PdfRenderer {
|
|
|
56
54
|
password: Option<&str>,
|
|
57
55
|
) -> Result<DynamicImage> {
|
|
58
56
|
let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
|
|
59
|
-
let err_msg = e
|
|
57
|
+
let err_msg = super::error::format_pdfium_error(e);
|
|
60
58
|
if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
|
|
61
59
|
PdfError::InvalidPassword
|
|
62
60
|
} else if err_msg.contains("password") || err_msg.contains("Password") {
|
|
@@ -114,7 +112,7 @@ impl PdfRenderer {
|
|
|
114
112
|
password: Option<&str>,
|
|
115
113
|
) -> Result<Vec<DynamicImage>> {
|
|
116
114
|
let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
|
|
117
|
-
let err_msg = e
|
|
115
|
+
let err_msg = super::error::format_pdfium_error(e);
|
|
118
116
|
if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
|
|
119
117
|
PdfError::InvalidPassword
|
|
120
118
|
} else if err_msg.contains("password") || err_msg.contains("Password") {
|
|
@@ -241,7 +239,12 @@ mod tests {
|
|
|
241
239
|
|
|
242
240
|
#[test]
|
|
243
241
|
fn test_renderer_size() {
|
|
244
|
-
|
|
242
|
+
// PdfRenderer may be a zero-sized type (ZST) since Pdfium is a ZST.
|
|
243
|
+
// The important thing is that the size is consistent and the type is valid.
|
|
244
|
+
// We just verify the type can be instantiated rather than checking specific size.
|
|
245
|
+
use std::mem::size_of;
|
|
246
|
+
let _size = size_of::<PdfRenderer>();
|
|
247
|
+
// If this compiles and runs, the type is valid regardless of size
|
|
245
248
|
}
|
|
246
249
|
|
|
247
250
|
#[test]
|
|
@@ -19,9 +19,7 @@ pub struct PdfTextExtractor {
|
|
|
19
19
|
|
|
20
20
|
impl PdfTextExtractor {
|
|
21
21
|
pub fn new() -> Result<Self> {
|
|
22
|
-
bind_pdfium(PdfError::TextExtractionFailed, "text extraction")?;
|
|
23
|
-
|
|
24
|
-
let pdfium = Pdfium {};
|
|
22
|
+
let pdfium = bind_pdfium(PdfError::TextExtractionFailed, "text extraction")?;
|
|
25
23
|
Ok(Self { pdfium })
|
|
26
24
|
}
|
|
27
25
|
|
|
@@ -31,7 +29,7 @@ impl PdfTextExtractor {
|
|
|
31
29
|
|
|
32
30
|
pub fn extract_text_with_password(&self, pdf_bytes: &[u8], password: Option<&str>) -> Result<String> {
|
|
33
31
|
let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
|
|
34
|
-
let err_msg = e
|
|
32
|
+
let err_msg = super::error::format_pdfium_error(e);
|
|
35
33
|
if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
|
|
36
34
|
PdfError::InvalidPassword
|
|
37
35
|
} else if err_msg.contains("password") || err_msg.contains("Password") {
|
|
@@ -67,7 +65,7 @@ impl PdfTextExtractor {
|
|
|
67
65
|
|
|
68
66
|
pub fn get_page_count(&self, pdf_bytes: &[u8]) -> Result<usize> {
|
|
69
67
|
let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, None).map_err(|e| {
|
|
70
|
-
let err_msg = e
|
|
68
|
+
let err_msg = super::error::format_pdfium_error(e);
|
|
71
69
|
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
72
70
|
PdfError::PasswordRequired
|
|
73
71
|
} else {
|
|
@@ -154,6 +154,17 @@ fn copy_pdfium_from_dir(src_dir: &Path, dest_dir: &Path) -> Result<(), String> {
|
|
|
154
154
|
|
|
155
155
|
if file_name_str.starts_with("libpdfium") || file_name_str.starts_with("pdfium") {
|
|
156
156
|
let dest_file = dest_dir.join(file_name);
|
|
157
|
+
|
|
158
|
+
// On Windows, skip copy if destination already exists and is accessible
|
|
159
|
+
// This avoids "Access denied" errors when the DLL is in use
|
|
160
|
+
if dest_file.exists() {
|
|
161
|
+
eprintln!(
|
|
162
|
+
"PDFium library already exists at {}, skipping copy",
|
|
163
|
+
dest_file.display()
|
|
164
|
+
);
|
|
165
|
+
return Ok(());
|
|
166
|
+
}
|
|
167
|
+
|
|
157
168
|
match fs::copy(&path, &dest_file) {
|
|
158
169
|
Ok(bytes_copied) => {
|
|
159
170
|
eprintln!(
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.0.0.pre.rc.20
|
|
4
|
+
version: 4.0.0.pre.rc.21
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-12-
|
|
11
|
+
date: 2025-12-26 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -56,14 +56,14 @@ dependencies:
|
|
|
56
56
|
name: rb_sys
|
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
|
58
58
|
requirements:
|
|
59
|
-
- -
|
|
59
|
+
- - '='
|
|
60
60
|
- !ruby/object:Gem::Version
|
|
61
61
|
version: 0.9.119
|
|
62
62
|
type: :development
|
|
63
63
|
prerelease: false
|
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
65
|
requirements:
|
|
66
|
-
- -
|
|
66
|
+
- - '='
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
68
|
version: 0.9.119
|
|
69
69
|
- !ruby/object:Gem::Dependency
|