kreuzberg 4.0.0.pre.rc.19 → 4.0.0.pre.rc.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/ext/kreuzberg_rb/native/Cargo.lock +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +2 -0
- data/kreuzberg.gemspec +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +5 -2
- data/vendor/kreuzberg/examples/bench_fixes.rs +74 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +65 -0
- data/vendor/kreuzberg/src/core/config.rs +3 -0
- data/vendor/kreuzberg/src/extractors/epub.rs +20 -32
- data/vendor/kreuzberg/src/extractors/pdf.rs +26 -29
- data/vendor/kreuzberg/src/pdf/bindings.rs +15 -44
- data/vendor/kreuzberg/src/pdf/error.rs +97 -0
- data/vendor/kreuzberg/src/pdf/fonts.rs +384 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +3 -5
- data/vendor/kreuzberg/src/pdf/mod.rs +4 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +10 -7
- data/vendor/kreuzberg/src/pdf/text.rs +53 -5
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +1 -1
- data/vendor/kreuzberg-ffi/build.rs +11 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +7 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bc17da6af86a2e71001e5e5844c764fbabdc29e1b777b368d179fd505f8440f1
|
|
4
|
+
data.tar.gz: eae32510b4c628b8aa73dd943fc0adf60c316efc2e20d2910e84a0f60bf9da24
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 869652660b57b61427d58ef01d832efc6854aa20c232db1c5fe0d1ce8601a70368c675d4b904be18d86f5b8d8b4730dbbfb050744457b03cf69063060c7521e8
|
|
7
|
+
data.tar.gz: 0c534272a1ddf5427221fde1917ab682b86adc7c1773d49865612240134b7576959f2bb2e4101fd00a5193016e6e141e0d01d1d29cfd82e4e1f25d057185be8a
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.0.0.pre.rc.19)
|
|
4
|
+
kreuzberg (4.0.0.pre.rc.21)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -66,7 +66,7 @@ GEM
|
|
|
66
66
|
rb-fsevent (0.11.2)
|
|
67
67
|
rb-inotify (0.11.1)
|
|
68
68
|
ffi (~> 1.0)
|
|
69
|
-
rb_sys (0.9.
|
|
69
|
+
rb_sys (0.9.119)
|
|
70
70
|
rake-compiler-dock (= 1.10.0)
|
|
71
71
|
rbs (3.10.0)
|
|
72
72
|
logger
|
|
@@ -147,7 +147,7 @@ DEPENDENCIES
|
|
|
147
147
|
pry-byebug (~> 3.10)
|
|
148
148
|
rake (~> 13.0)
|
|
149
149
|
rake-compiler (~> 1.2)
|
|
150
|
-
rb_sys (
|
|
150
|
+
rb_sys (= 0.9.119)
|
|
151
151
|
rbs (~> 3.0)
|
|
152
152
|
rspec (~> 3.12)
|
|
153
153
|
rubocop (~> 1.66)
|
data/kreuzberg.gemspec
CHANGED
|
@@ -201,7 +201,7 @@ Gem::Specification.new do |spec|
|
|
|
201
201
|
spec.add_development_dependency 'bundler', '~> 4.0'
|
|
202
202
|
spec.add_development_dependency 'rake', '~> 13.0'
|
|
203
203
|
spec.add_development_dependency 'rake-compiler', '~> 1.2'
|
|
204
|
-
spec.add_development_dependency 'rb_sys', '
|
|
204
|
+
spec.add_development_dependency 'rb_sys', '0.9.119'
|
|
205
205
|
spec.add_development_dependency 'rspec', '~> 3.12'
|
|
206
206
|
unless Gem.win_platform?
|
|
207
207
|
spec.add_development_dependency 'rbs', '~> 3.0'
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.0.0-rc.19"
|
|
3
|
+
version = "4.0.0-rc.21"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -167,7 +167,10 @@ reqwest = { version = "0.12.25", default-features = false, features = [
|
|
|
167
167
|
"rustls-tls",
|
|
168
168
|
], optional = true }
|
|
169
169
|
# Format extractors (optional)
|
|
170
|
-
pdfium-render = {
|
|
170
|
+
pdfium-render = { package = "kreuzberg-pdfium-render", version = "0.9.0", features = [
|
|
171
|
+
"thread_safe",
|
|
172
|
+
"image_latest",
|
|
173
|
+
], optional = true }
|
|
171
174
|
lopdf = { version = "0.38.0", optional = true }
|
|
172
175
|
calamine = { version = "0.32.0", features = ["dates"], optional = true }
|
|
173
176
|
polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
use kreuzberg::{ExtractionConfig, extract_file_sync};
|
|
2
|
+
use std::path::PathBuf;
|
|
3
|
+
use std::time::Instant;
|
|
4
|
+
|
|
5
|
+
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
6
|
+
let test_pdfs = [
|
|
7
|
+
(
|
|
8
|
+
"a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
|
|
9
|
+
"Academic Paper (18 fonts)",
|
|
10
|
+
),
|
|
11
|
+
(
|
|
12
|
+
"5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf",
|
|
13
|
+
"Intel PDF (5 fonts)",
|
|
14
|
+
),
|
|
15
|
+
("fake_memo.pdf", "Tiny Memo (3-5 fonts)"),
|
|
16
|
+
];
|
|
17
|
+
|
|
18
|
+
let config = ExtractionConfig {
|
|
19
|
+
use_cache: false,
|
|
20
|
+
..Default::default()
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
println!("=== PDFium Fork Fixes Benchmark ===\n");
|
|
24
|
+
println!("Testing warm execution fix and font overhead fix\n");
|
|
25
|
+
|
|
26
|
+
for (file, description) in &test_pdfs {
|
|
27
|
+
let path = PathBuf::from(format!("test_documents/pdfs/{}", file));
|
|
28
|
+
println!("=== {} ===", description);
|
|
29
|
+
println!("File: {}\n", file);
|
|
30
|
+
|
|
31
|
+
// Cold start
|
|
32
|
+
let start = Instant::now();
|
|
33
|
+
let result = extract_file_sync(&path, None, &config)?;
|
|
34
|
+
let cold = start.elapsed();
|
|
35
|
+
println!("Cold start: {:>8.2} ms", cold.as_secs_f64() * 1000.0);
|
|
36
|
+
println!("Text length: {} chars\n", result.content.len());
|
|
37
|
+
|
|
38
|
+
// Warm iterations
|
|
39
|
+
let mut warm_times = Vec::new();
|
|
40
|
+
for i in 1..=5 {
|
|
41
|
+
let start = Instant::now();
|
|
42
|
+
let _ = extract_file_sync(&path, None, &config)?;
|
|
43
|
+
let warm = start.elapsed();
|
|
44
|
+
warm_times.push(warm);
|
|
45
|
+
let speedup = cold.as_micros() as f64 / warm.as_micros() as f64;
|
|
46
|
+
println!(
|
|
47
|
+
"Warm {:>2}: {:>8.2} ms ({:>5.2}x faster than cold)",
|
|
48
|
+
i,
|
|
49
|
+
warm.as_secs_f64() * 1000.0,
|
|
50
|
+
speedup
|
|
51
|
+
);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
// Statistics
|
|
55
|
+
let avg_warm = warm_times.iter().sum::<std::time::Duration>() / warm_times.len() as u32;
|
|
56
|
+
let avg_speedup = cold.as_micros() as f64 / avg_warm.as_micros() as f64;
|
|
57
|
+
println!(
|
|
58
|
+
"\nAverage warm: {:>8.2} ms ({:>5.2}x faster than cold)",
|
|
59
|
+
avg_warm.as_secs_f64() * 1000.0,
|
|
60
|
+
avg_speedup
|
|
61
|
+
);
|
|
62
|
+
println!("\n{}\n", "=".repeat(60));
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
println!("\n=== Success Criteria ===");
|
|
66
|
+
println!("✓ Warm Execution Fix:");
|
|
67
|
+
println!(" - Warm times should be 1-3x faster than cold (realistic)");
|
|
68
|
+
println!(" - NOT 100-700x faster (the bug we fixed)");
|
|
69
|
+
println!("\n✓ Font Overhead Fix:");
|
|
70
|
+
println!(" - Academic Paper cold: ~130-145ms (matches baseline)");
|
|
71
|
+
println!(" - NOT 180-195ms (the regression we fixed)");
|
|
72
|
+
|
|
73
|
+
Ok(())
|
|
74
|
+
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
use kreuzberg::{ExtractionConfig, extract_file};
|
|
2
|
+
use std::time::Instant;
|
|
3
|
+
|
|
4
|
+
#[tokio::main]
|
|
5
|
+
async fn main() {
|
|
6
|
+
let config = ExtractionConfig {
|
|
7
|
+
use_cache: false,
|
|
8
|
+
..Default::default()
|
|
9
|
+
};
|
|
10
|
+
|
|
11
|
+
println!("Testing PDF extraction with cleaned pdfium-render fork...\n");
|
|
12
|
+
|
|
13
|
+
// Test 1: Simple extraction
|
|
14
|
+
println!("Test 1: fake_memo.pdf");
|
|
15
|
+
let start = Instant::now();
|
|
16
|
+
match extract_file("test_documents/pdfs/fake_memo.pdf", None, &config).await {
|
|
17
|
+
Ok(result) => {
|
|
18
|
+
let duration = start.elapsed();
|
|
19
|
+
println!(" ✓ Success! Duration: {:?}", duration);
|
|
20
|
+
println!(" ✓ Text length: {} chars", result.content.len());
|
|
21
|
+
}
|
|
22
|
+
Err(e) => {
|
|
23
|
+
println!(" ✗ Failed: {}", e);
|
|
24
|
+
std::process::exit(1);
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// Test 2: Warm iteration
|
|
29
|
+
println!("\nTest 2: Warm iteration");
|
|
30
|
+
let start = Instant::now();
|
|
31
|
+
match extract_file("test_documents/pdfs/fake_memo.pdf", None, &config).await {
|
|
32
|
+
Ok(result) => {
|
|
33
|
+
let duration = start.elapsed();
|
|
34
|
+
println!(" ✓ Success! Duration: {:?}", duration);
|
|
35
|
+
println!(" ✓ Text length: {} chars", result.content.len());
|
|
36
|
+
}
|
|
37
|
+
Err(e) => {
|
|
38
|
+
println!(" ✗ Failed: {}", e);
|
|
39
|
+
std::process::exit(1);
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// Test 3: Academic Paper (font-heavy)
|
|
44
|
+
println!("\nTest 3: Academic Paper (18 fonts)");
|
|
45
|
+
let start = Instant::now();
|
|
46
|
+
match extract_file(
|
|
47
|
+
"test_documents/pdfs/a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
|
|
48
|
+
None,
|
|
49
|
+
&config,
|
|
50
|
+
)
|
|
51
|
+
.await
|
|
52
|
+
{
|
|
53
|
+
Ok(result) => {
|
|
54
|
+
let duration = start.elapsed();
|
|
55
|
+
println!(" ✓ Success! Duration: {:?}", duration);
|
|
56
|
+
println!(" ✓ Text length: {} chars", result.content.len());
|
|
57
|
+
}
|
|
58
|
+
Err(e) => {
|
|
59
|
+
println!(" ✗ Failed: {}", e);
|
|
60
|
+
std::process::exit(1);
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
println!("\n✅ All tests passed! Cleaned pdfium-render fork is working correctly.");
|
|
65
|
+
}
|
|
@@ -1254,6 +1254,9 @@ enable_quality_processing = true
|
|
|
1254
1254
|
let config1 = ExtractionConfig::from_toml_file(&config_path).unwrap();
|
|
1255
1255
|
assert!(!config1.use_cache);
|
|
1256
1256
|
|
|
1257
|
+
// Sleep to ensure mtime changes (some filesystems have 1-second granularity)
|
|
1258
|
+
std::thread::sleep(std::time::Duration::from_secs(1));
|
|
1259
|
+
|
|
1257
1260
|
fs::write(
|
|
1258
1261
|
&config_path,
|
|
1259
1262
|
r#"
|
|
@@ -293,49 +293,32 @@ impl EpubExtractor {
|
|
|
293
293
|
}
|
|
294
294
|
|
|
295
295
|
/// Extract metadata from EPUB OPF file
|
|
296
|
-
fn extract_metadata(opf_xml: &str) -> Result<BTreeMap<String, serde_json::Value
|
|
297
|
-
let mut
|
|
296
|
+
fn extract_metadata(opf_xml: &str) -> Result<(OepbMetadata, BTreeMap<String, serde_json::Value>)> {
|
|
297
|
+
let mut additional_metadata = BTreeMap::new();
|
|
298
298
|
|
|
299
299
|
let (epub_metadata, _) = Self::parse_opf(opf_xml)?;
|
|
300
300
|
|
|
301
|
-
if let Some(
|
|
302
|
-
|
|
301
|
+
if let Some(identifier) = epub_metadata.identifier.clone() {
|
|
302
|
+
additional_metadata.insert("identifier".to_string(), serde_json::json!(identifier));
|
|
303
303
|
}
|
|
304
304
|
|
|
305
|
-
if let Some(
|
|
306
|
-
|
|
307
|
-
metadata.insert("authors".to_string(), serde_json::json!(vec![creator]));
|
|
305
|
+
if let Some(publisher) = epub_metadata.publisher.clone() {
|
|
306
|
+
additional_metadata.insert("publisher".to_string(), serde_json::json!(publisher));
|
|
308
307
|
}
|
|
309
308
|
|
|
310
|
-
if let Some(
|
|
311
|
-
|
|
309
|
+
if let Some(subject) = epub_metadata.subject.clone() {
|
|
310
|
+
additional_metadata.insert("subject".to_string(), serde_json::json!(subject));
|
|
312
311
|
}
|
|
313
312
|
|
|
314
|
-
if let Some(
|
|
315
|
-
|
|
313
|
+
if let Some(description) = epub_metadata.description.clone() {
|
|
314
|
+
additional_metadata.insert("description".to_string(), serde_json::json!(description));
|
|
316
315
|
}
|
|
317
316
|
|
|
318
|
-
if let Some(
|
|
319
|
-
|
|
317
|
+
if let Some(rights) = epub_metadata.rights.clone() {
|
|
318
|
+
additional_metadata.insert("rights".to_string(), serde_json::json!(rights));
|
|
320
319
|
}
|
|
321
320
|
|
|
322
|
-
|
|
323
|
-
metadata.insert("publisher".to_string(), serde_json::json!(publisher));
|
|
324
|
-
}
|
|
325
|
-
|
|
326
|
-
if let Some(subject) = epub_metadata.subject {
|
|
327
|
-
metadata.insert("subject".to_string(), serde_json::json!(subject));
|
|
328
|
-
}
|
|
329
|
-
|
|
330
|
-
if let Some(description) = epub_metadata.description {
|
|
331
|
-
metadata.insert("description".to_string(), serde_json::json!(description));
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
if let Some(rights) = epub_metadata.rights {
|
|
335
|
-
metadata.insert("rights".to_string(), serde_json::json!(rights));
|
|
336
|
-
}
|
|
337
|
-
|
|
338
|
-
Ok(metadata)
|
|
321
|
+
Ok((epub_metadata, additional_metadata))
|
|
339
322
|
}
|
|
340
323
|
|
|
341
324
|
/// Parse container.xml to find the OPF file path
|
|
@@ -564,13 +547,18 @@ impl DocumentExtractor for EpubExtractor {
|
|
|
564
547
|
|
|
565
548
|
let extracted_content = Self::extract_content(&mut archive, &opf_path, &manifest_dir)?;
|
|
566
549
|
|
|
567
|
-
let
|
|
568
|
-
let metadata_map: std::collections::HashMap<String, serde_json::Value> =
|
|
550
|
+
let (epub_metadata, additional_metadata) = Self::extract_metadata(&opf_xml)?;
|
|
551
|
+
let metadata_map: std::collections::HashMap<String, serde_json::Value> =
|
|
552
|
+
additional_metadata.into_iter().collect();
|
|
569
553
|
|
|
570
554
|
Ok(ExtractionResult {
|
|
571
555
|
content: extracted_content,
|
|
572
556
|
mime_type: mime_type.to_string(),
|
|
573
557
|
metadata: Metadata {
|
|
558
|
+
title: epub_metadata.title,
|
|
559
|
+
authors: epub_metadata.creator.map(|c| vec![c]),
|
|
560
|
+
language: epub_metadata.language,
|
|
561
|
+
created_at: epub_metadata.date,
|
|
574
562
|
additional: metadata_map,
|
|
575
563
|
..Default::default()
|
|
576
564
|
},
|
|
@@ -392,26 +392,26 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
392
392
|
{
|
|
393
393
|
// For WASM targets, PDFium must be properly initialized in the environment.
|
|
394
394
|
// The error message will direct users to the documentation for setup requirements.
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
source: None,
|
|
406
|
-
}
|
|
407
|
-
} else {
|
|
408
|
-
pdf_err.into()
|
|
395
|
+
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium").map_err(
|
|
396
|
+
|pdf_err| {
|
|
397
|
+
// Provide context-specific error for WASM PDF failures
|
|
398
|
+
if pdf_err.to_string().contains("WASM") || pdf_err.to_string().contains("Module") {
|
|
399
|
+
crate::error::KreuzbergError::Parsing {
|
|
400
|
+
message: "PDF extraction requires proper WASM module initialization. \
|
|
401
|
+
Ensure your WASM environment is set up with PDFium support. \
|
|
402
|
+
See: https://docs.kreuzberg.dev/wasm/pdf"
|
|
403
|
+
.to_string(),
|
|
404
|
+
source: None,
|
|
409
405
|
}
|
|
410
|
-
}
|
|
411
|
-
|
|
406
|
+
} else {
|
|
407
|
+
pdf_err.into()
|
|
408
|
+
}
|
|
409
|
+
},
|
|
410
|
+
)?;
|
|
411
|
+
let pdfium = Pdfium;
|
|
412
412
|
|
|
413
413
|
let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
|
|
414
|
-
let err_msg = e
|
|
414
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
415
415
|
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
416
416
|
PdfError::PasswordRequired
|
|
417
417
|
} else {
|
|
@@ -431,13 +431,12 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
431
431
|
tokio::task::spawn_blocking(move || {
|
|
432
432
|
let _guard = span.entered();
|
|
433
433
|
|
|
434
|
-
|
|
435
|
-
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
434
|
+
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
436
435
|
|
|
437
|
-
let pdfium = Pdfium
|
|
436
|
+
let pdfium = Pdfium;
|
|
438
437
|
|
|
439
438
|
let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
|
|
440
|
-
let err_msg = e
|
|
439
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
441
440
|
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
442
441
|
PdfError::PasswordRequired
|
|
443
442
|
} else {
|
|
@@ -464,13 +463,12 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
464
463
|
.await
|
|
465
464
|
.map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
|
|
466
465
|
} else {
|
|
467
|
-
|
|
468
|
-
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
466
|
+
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
469
467
|
|
|
470
|
-
let pdfium = Pdfium
|
|
468
|
+
let pdfium = Pdfium;
|
|
471
469
|
|
|
472
470
|
let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
|
|
473
|
-
let err_msg = e
|
|
471
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
474
472
|
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
475
473
|
PdfError::PasswordRequired
|
|
476
474
|
} else {
|
|
@@ -484,13 +482,12 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
484
482
|
}
|
|
485
483
|
#[cfg(all(not(target_arch = "wasm32"), not(feature = "tokio-runtime")))]
|
|
486
484
|
{
|
|
487
|
-
|
|
488
|
-
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
485
|
+
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
489
486
|
|
|
490
|
-
let pdfium = Pdfium
|
|
487
|
+
let pdfium = Pdfium;
|
|
491
488
|
|
|
492
489
|
let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
|
|
493
|
-
let err_msg = e
|
|
490
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
494
491
|
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
495
492
|
PdfError::PasswordRequired
|
|
496
493
|
} else {
|
|
@@ -82,10 +82,7 @@ fn bind_pdfium_impl() -> Result<(Option<PathBuf>, Box<dyn PdfiumLibraryBindings>
|
|
|
82
82
|
/// Instead of failing permanently, we recover by extracting the inner value from the
|
|
83
83
|
/// poisoned lock and proceeding. This ensures PDF extraction can continue even if an
|
|
84
84
|
/// earlier panic occurred, as long as the state is consistent.
|
|
85
|
-
pub(crate) fn bind_pdfium(
|
|
86
|
-
map_err: fn(String) -> PdfError,
|
|
87
|
-
context: &'static str,
|
|
88
|
-
) -> Result<Box<dyn PdfiumLibraryBindings>, PdfError> {
|
|
85
|
+
pub(crate) fn bind_pdfium(map_err: fn(String) -> PdfError, context: &'static str) -> Result<Pdfium, PdfError> {
|
|
89
86
|
let mut state = PDFIUM_STATE.lock().unwrap_or_else(|poisoned| {
|
|
90
87
|
// SAFETY: Recovering from a poisoned lock is safe here because:
|
|
91
88
|
// 1. The poisoned state still contains valid data (just a guard from a panicked thread)
|
|
@@ -97,55 +94,26 @@ pub(crate) fn bind_pdfium(
|
|
|
97
94
|
// Initialize on first call
|
|
98
95
|
match &*state {
|
|
99
96
|
InitializationState::Uninitialized => match bind_pdfium_impl() {
|
|
100
|
-
Ok((lib_dir,
|
|
97
|
+
Ok((lib_dir, bindings)) => {
|
|
98
|
+
// Initialize Pdfium singleton with the bindings and return it
|
|
99
|
+
let pdfium = Pdfium::new(bindings);
|
|
101
100
|
*state = InitializationState::Initialized { lib_dir };
|
|
101
|
+
Ok(pdfium)
|
|
102
102
|
}
|
|
103
103
|
Err(err) => {
|
|
104
104
|
*state = InitializationState::Failed(err.clone());
|
|
105
|
-
|
|
105
|
+
Err(map_err(format!("Pdfium initialization failed ({}): {}", context, err)))
|
|
106
106
|
}
|
|
107
107
|
},
|
|
108
|
-
InitializationState::Failed(err) =>
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
)));
|
|
113
|
-
}
|
|
108
|
+
InitializationState::Failed(err) => Err(map_err(format!(
|
|
109
|
+
"Pdfium initialization previously failed ({}): {}",
|
|
110
|
+
context, err
|
|
111
|
+
))),
|
|
114
112
|
InitializationState::Initialized { .. } => {
|
|
115
|
-
// Already initialized,
|
|
116
|
-
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
// Create fresh bindings from cached state
|
|
120
|
-
#[cfg(all(feature = "pdf", feature = "bundled-pdfium", not(target_arch = "wasm32")))]
|
|
121
|
-
{
|
|
122
|
-
match &*state {
|
|
123
|
-
InitializationState::Initialized { lib_dir: Some(lib_dir) } => {
|
|
124
|
-
Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(lib_dir))
|
|
125
|
-
.map_err(|e| map_err(format!("Failed to create Pdfium bindings ({}): {}", context, e)))
|
|
126
|
-
}
|
|
127
|
-
_ => {
|
|
128
|
-
// This should not happen as state is guaranteed to be Initialized here
|
|
129
|
-
Err(map_err(format!(
|
|
130
|
-
"Internal error: Pdfium state not properly initialized ({})",
|
|
131
|
-
context
|
|
132
|
-
)))
|
|
133
|
-
}
|
|
113
|
+
// Already initialized, return a new accessor to the singleton
|
|
114
|
+
Ok(Pdfium)
|
|
134
115
|
}
|
|
135
116
|
}
|
|
136
|
-
|
|
137
|
-
// For system pdfium or WASM, create fresh bindings
|
|
138
|
-
#[cfg(all(feature = "pdf", feature = "bundled-pdfium", target_arch = "wasm32"))]
|
|
139
|
-
{
|
|
140
|
-
Pdfium::bind_to_system_library()
|
|
141
|
-
.map_err(|e| map_err(format!("Failed to create Pdfium bindings ({}): {}", context, e)))
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
#[cfg(all(feature = "pdf", not(feature = "bundled-pdfium")))]
|
|
145
|
-
{
|
|
146
|
-
Pdfium::bind_to_system_library()
|
|
147
|
-
.map_err(|e| map_err(format!("Failed to create Pdfium bindings ({}): {}", context, e)))
|
|
148
|
-
}
|
|
149
117
|
}
|
|
150
118
|
|
|
151
119
|
#[cfg(test)]
|
|
@@ -158,6 +126,9 @@ mod tests {
|
|
|
158
126
|
// First call should initialize
|
|
159
127
|
let result = bind_pdfium(PdfError::TextExtractionFailed, "test context");
|
|
160
128
|
assert!(result.is_ok(), "First bind_pdfium call should succeed");
|
|
129
|
+
// Verify the returned Pdfium instance is usable
|
|
130
|
+
let pdfium = result.unwrap();
|
|
131
|
+
assert!(pdfium.is_pdfium_ready(), "Pdfium should be initialized");
|
|
161
132
|
}
|
|
162
133
|
|
|
163
134
|
#[test]
|
|
@@ -11,6 +11,7 @@ pub enum PdfError {
|
|
|
11
11
|
RenderingFailed(String),
|
|
12
12
|
MetadataExtractionFailed(String),
|
|
13
13
|
ExtractionFailed(String),
|
|
14
|
+
FontLoadingFailed(String),
|
|
14
15
|
IOError(String),
|
|
15
16
|
}
|
|
16
17
|
|
|
@@ -30,6 +31,7 @@ impl fmt::Display for PdfError {
|
|
|
30
31
|
write!(f, "Metadata extraction failed: {}", msg)
|
|
31
32
|
}
|
|
32
33
|
PdfError::ExtractionFailed(msg) => write!(f, "Extraction failed: {}", msg),
|
|
34
|
+
PdfError::FontLoadingFailed(msg) => write!(f, "Font loading failed: {}", msg),
|
|
33
35
|
PdfError::IOError(msg) => write!(f, "I/O error: {}", msg),
|
|
34
36
|
}
|
|
35
37
|
}
|
|
@@ -50,6 +52,34 @@ impl From<lopdf::Error> for PdfError {
|
|
|
50
52
|
|
|
51
53
|
pub type Result<T> = std::result::Result<T, PdfError>;
|
|
52
54
|
|
|
55
|
+
/// Format a pdfium error for display.
///
/// The error is rendered through its `Debug` implementation and, when the
/// result has the shape of a tuple variant (`Variant(inner)` or
/// `Variant(inner,)`), rewritten as `Variant: inner`:
///
/// - `"PdfiumLibraryInternalError(FormatError,)"` -> `"PdfiumLibraryInternalError: FormatError"`
/// - `"Outer(Inner(Leaf))"` -> `"Outer: Inner(Leaf)"` (only the outermost pair is removed)
/// - `"SomeError()"` -> `"SomeError"`
/// - `"SomeError"` -> `"SomeError"`
///
/// Any other shape is returned exactly as `Debug` produced it. That covers
/// struct variants whose payload merely contains a `(`, and messages that do
/// not end in `)`.
pub(crate) fn format_pdfium_error<E: std::fmt::Debug>(error: E) -> String {
    let debug_msg = format!("{:?}", error);

    let cleaned = debug_msg.split_once('(').and_then(|(variant, rest)| {
        // Remove exactly one closing parenthesis: the partner of the first
        // opening one. Stripping all trailing ')' would unbalance nested
        // tuple variants.
        let inner = rest.strip_suffix(')')?;

        // The text before '(' must be a bare (possibly path-qualified) name;
        // otherwise the '(' came from inside a payload such as a quoted
        // string, and rewriting would garble the message.
        let is_variant_name =
            !variant.is_empty() && variant.chars().all(|c| c.is_alphanumeric() || c == '_' || c == ':');
        if !is_variant_name {
            return None;
        }

        // Drop the optional trailing comma of the "Variant(inner,)" form.
        let inner = inner.strip_suffix(',').unwrap_or(inner);

        Some(if inner.is_empty() {
            variant.to_string()
        } else {
            format!("{}: {}", variant, inner)
        })
    });

    cleaned.unwrap_or(debug_msg)
}
|
|
82
|
+
|
|
53
83
|
#[cfg(test)]
|
|
54
84
|
mod tests {
|
|
55
85
|
use super::*;
|
|
@@ -127,4 +157,71 @@ mod tests {
|
|
|
127
157
|
let err = PdfError::ExtractionFailed("page data mismatch".to_string());
|
|
128
158
|
assert_eq!(err.to_string(), "Extraction failed: page data mismatch");
|
|
129
159
|
}
|
|
160
|
+
|
|
161
|
+
/// `FontLoadingFailed` renders as "Font loading failed: <message>".
#[test]
fn test_font_loading_failed_error() {
    let rendered = PdfError::FontLoadingFailed("missing font file".to_string()).to_string();
    assert_eq!(rendered, "Font loading failed: missing font file");
}
|
|
166
|
+
|
|
167
|
+
/// A tuple struct with a string payload is rendered as `Name: payload`.
#[test]
fn test_format_pdfium_error_with_inner_value() {
    #[derive(Debug)]
    #[allow(dead_code)] // the field is only read through the derived Debug impl
    struct MockError(String);

    let formatted = format_pdfium_error(MockError("FormatError,".to_string()));

    // Derived Debug yields `MockError("FormatError,")`. Only the wrapping
    // parentheses are removed; the comma is part of the quoted payload and
    // must be preserved.
    assert_eq!(formatted, "MockError: \"FormatError,\"");
}
|
|
180
|
+
|
|
181
|
+
/// A Debug rendering without parentheses is returned unchanged.
#[test]
fn test_format_pdfium_error_simple() {
    #[derive(Debug)]
    struct SimpleError;

    assert_eq!(format_pdfium_error(SimpleError), "SimpleError");
}
|
|
190
|
+
|
|
191
|
+
/// A rendering with empty parentheses (`Name()`) collapses to just `Name`.
#[test]
fn test_format_pdfium_error_empty_inner() {
    // Derived Debug never prints "()" for a unit struct, so the rendering is
    // written by hand to actually reach the empty-inner branch.
    struct EmptyInner;

    impl std::fmt::Debug for EmptyInner {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            f.write_str("EmptyInner()")
        }
    }

    assert_eq!(format_pdfium_error(EmptyInner), "EmptyInner");
}
|
|
201
|
+
|
|
202
|
+
/// Both `Variant(Inner,)` and `Variant(Inner)` are rendered as `Variant: Inner`.
#[test]
fn test_format_pdfium_error_cleans_trailing_comma() {
    // Hand-written Debug output reproducing the trailing-comma form
    // "PdfiumLibraryInternalError(FormatError,)". Derived Debug does not emit
    // ",)", so a derived type alone cannot exercise this path.
    struct TrailingComma;

    impl std::fmt::Debug for TrailingComma {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            f.write_str("PdfiumLibraryInternalError(FormatError,)")
        }
    }

    assert_eq!(
        format_pdfium_error(TrailingComma),
        "PdfiumLibraryInternalError: FormatError"
    );

    // The same shape produced by derived Debug (no trailing comma).
    #[derive(Debug)]
    #[allow(dead_code)] // payload is only read through the derived Debug impl
    enum PdfiumError {
        PdfiumLibraryInternalError(InternalError),
    }

    #[derive(Debug)]
    enum InternalError {
        FormatError,
    }

    let formatted = format_pdfium_error(PdfiumError::PdfiumLibraryInternalError(InternalError::FormatError));
    assert_eq!(formatted, "PdfiumLibraryInternalError: FormatError");
}
|
|
130
227
|
}
|