kreuzberg 4.0.0.pre.rc.19 → 4.0.0.pre.rc.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0f5df1c1138122d449d77193b97ee6c4f40de044077765f1d68ce4f0bc6aba2a
4
- data.tar.gz: c48abedda657f892a912cd9cca7f40167fa3257d75f98527e0bc95da4580e630
3
+ metadata.gz: bc17da6af86a2e71001e5e5844c764fbabdc29e1b777b368d179fd505f8440f1
4
+ data.tar.gz: eae32510b4c628b8aa73dd943fc0adf60c316efc2e20d2910e84a0f60bf9da24
5
5
  SHA512:
6
- metadata.gz: a2a0a7854003f48d69eb89cf79a3252aadba11f001edfe7ba4d03f16198b3d68394bd84589c5b379c7a4dcd4784391a2fd3b1c5ce636d8a490382a77d62fd671
7
- data.tar.gz: f3d571515eb5598e34fdc8dd18296cd069a6fa25e7cf9017a9f3f1980a82fcebca977e9fc18e361d0f00386f72109ffe8f3e1afcf15dcbc35b5e6472b3f83853
6
+ metadata.gz: 869652660b57b61427d58ef01d832efc6854aa20c232db1c5fe0d1ce8601a70368c675d4b904be18d86f5b8d8b4730dbbfb050744457b03cf69063060c7521e8
7
+ data.tar.gz: 0c534272a1ddf5427221fde1917ab682b86adc7c1773d49865612240134b7576959f2bb2e4101fd00a5193016e6e141e0d01d1d29cfd82e4e1f25d057185be8a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.0.0.pre.rc.19)
4
+ kreuzberg (4.0.0.pre.rc.21)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -66,7 +66,7 @@ GEM
66
66
  rb-fsevent (0.11.2)
67
67
  rb-inotify (0.11.1)
68
68
  ffi (~> 1.0)
69
- rb_sys (0.9.123)
69
+ rb_sys (0.9.119)
70
70
  rake-compiler-dock (= 1.10.0)
71
71
  rbs (3.10.0)
72
72
  logger
@@ -147,7 +147,7 @@ DEPENDENCIES
147
147
  pry-byebug (~> 3.10)
148
148
  rake (~> 13.0)
149
149
  rake-compiler (~> 1.2)
150
- rb_sys (~> 0.9.119)
150
+ rb_sys (= 0.9.119)
151
151
  rbs (~> 3.0)
152
152
  rspec (~> 3.12)
153
153
  rubocop (~> 1.66)
@@ -2540,7 +2540,7 @@ dependencies = [
2540
2540
 
2541
2541
  [[package]]
2542
2542
  name = "kreuzberg-rb"
2543
- version = "4.0.0-rc.19"
2543
+ version = "4.0.0-rc.20"
2544
2544
  dependencies = [
2545
2545
  "async-trait",
2546
2546
  "html-to-markdown-rs",
@@ -3,7 +3,7 @@
3
3
 
4
4
  [package]
5
5
  name = "kreuzberg-rb"
6
- version = "4.0.0-rc.19"
6
+ version = "4.0.0-rc.21"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -670,6 +670,8 @@ fn parse_postprocessor_config(ruby: &Ruby, hash: RHash) -> Result<PostProcessorC
670
670
  enabled,
671
671
  enabled_processors,
672
672
  disabled_processors,
673
+ enabled_set: None,
674
+ disabled_set: None,
673
675
  };
674
676
 
675
677
  Ok(config)
data/kreuzberg.gemspec CHANGED
@@ -201,7 +201,7 @@ Gem::Specification.new do |spec|
201
201
  spec.add_development_dependency 'bundler', '~> 4.0'
202
202
  spec.add_development_dependency 'rake', '~> 13.0'
203
203
  spec.add_development_dependency 'rake-compiler', '~> 1.2'
204
- spec.add_development_dependency 'rb_sys', '~> 0.9.119'
204
+ spec.add_development_dependency 'rb_sys', '0.9.119'
205
205
  spec.add_development_dependency 'rspec', '~> 3.12'
206
206
  unless Gem.win_platform?
207
207
  spec.add_development_dependency 'rbs', '~> 3.0'
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.0.0-rc.19'
4
+ VERSION = '4.0.0-rc.21'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.0.0-rc.19"
5
+ version = "4.0.0-rc.21"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.0.0-rc.19"
3
+ version = "4.0.0-rc.21"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -167,7 +167,10 @@ reqwest = { version = "0.12.25", default-features = false, features = [
167
167
  "rustls-tls",
168
168
  ], optional = true }
169
169
  # Format extractors (optional)
170
- pdfium-render = { version = "0.8.37", features = ["thread_safe", "image_latest"], optional = true }
170
+ pdfium-render = { package = "kreuzberg-pdfium-render", version = "0.9.0", features = [
171
+ "thread_safe",
172
+ "image_latest",
173
+ ], optional = true }
171
174
  lopdf = { version = "0.38.0", optional = true }
172
175
  calamine = { version = "0.32.0", features = ["dates"], optional = true }
173
176
  polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
@@ -0,0 +1,74 @@
1
+ use kreuzberg::{ExtractionConfig, extract_file_sync};
2
+ use std::path::PathBuf;
3
+ use std::time::Instant;
4
+
5
+ fn main() -> Result<(), Box<dyn std::error::Error>> {
6
+ let test_pdfs = [
7
+ (
8
+ "a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
9
+ "Academic Paper (18 fonts)",
10
+ ),
11
+ (
12
+ "5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf",
13
+ "Intel PDF (5 fonts)",
14
+ ),
15
+ ("fake_memo.pdf", "Tiny Memo (3-5 fonts)"),
16
+ ];
17
+
18
+ let config = ExtractionConfig {
19
+ use_cache: false,
20
+ ..Default::default()
21
+ };
22
+
23
+ println!("=== PDFium Fork Fixes Benchmark ===\n");
24
+ println!("Testing warm execution fix and font overhead fix\n");
25
+
26
+ for (file, description) in &test_pdfs {
27
+ let path = PathBuf::from(format!("test_documents/pdfs/{}", file));
28
+ println!("=== {} ===", description);
29
+ println!("File: {}\n", file);
30
+
31
+ // Cold start
32
+ let start = Instant::now();
33
+ let result = extract_file_sync(&path, None, &config)?;
34
+ let cold = start.elapsed();
35
+ println!("Cold start: {:>8.2} ms", cold.as_secs_f64() * 1000.0);
36
+ println!("Text length: {} chars\n", result.content.len());
37
+
38
+ // Warm iterations
39
+ let mut warm_times = Vec::new();
40
+ for i in 1..=5 {
41
+ let start = Instant::now();
42
+ let _ = extract_file_sync(&path, None, &config)?;
43
+ let warm = start.elapsed();
44
+ warm_times.push(warm);
45
+ let speedup = cold.as_micros() as f64 / warm.as_micros() as f64;
46
+ println!(
47
+ "Warm {:>2}: {:>8.2} ms ({:>5.2}x faster than cold)",
48
+ i,
49
+ warm.as_secs_f64() * 1000.0,
50
+ speedup
51
+ );
52
+ }
53
+
54
+ // Statistics
55
+ let avg_warm = warm_times.iter().sum::<std::time::Duration>() / warm_times.len() as u32;
56
+ let avg_speedup = cold.as_micros() as f64 / avg_warm.as_micros() as f64;
57
+ println!(
58
+ "\nAverage warm: {:>8.2} ms ({:>5.2}x faster than cold)",
59
+ avg_warm.as_secs_f64() * 1000.0,
60
+ avg_speedup
61
+ );
62
+ println!("\n{}\n", "=".repeat(60));
63
+ }
64
+
65
+ println!("\n=== Success Criteria ===");
66
+ println!("✓ Warm Execution Fix:");
67
+ println!(" - Warm times should be 1-3x faster than cold (realistic)");
68
+ println!(" - NOT 100-700x faster (the bug we fixed)");
69
+ println!("\n✓ Font Overhead Fix:");
70
+ println!(" - Academic Paper cold: ~130-145ms (matches baseline)");
71
+ println!(" - NOT 180-195ms (the regression we fixed)");
72
+
73
+ Ok(())
74
+ }
@@ -0,0 +1,65 @@
1
+ use kreuzberg::{ExtractionConfig, extract_file};
2
+ use std::time::Instant;
3
+
4
+ #[tokio::main]
5
+ async fn main() {
6
+ let config = ExtractionConfig {
7
+ use_cache: false,
8
+ ..Default::default()
9
+ };
10
+
11
+ println!("Testing PDF extraction with cleaned pdfium-render fork...\n");
12
+
13
+ // Test 1: Simple extraction
14
+ println!("Test 1: fake_memo.pdf");
15
+ let start = Instant::now();
16
+ match extract_file("test_documents/pdfs/fake_memo.pdf", None, &config).await {
17
+ Ok(result) => {
18
+ let duration = start.elapsed();
19
+ println!(" ✓ Success! Duration: {:?}", duration);
20
+ println!(" ✓ Text length: {} chars", result.content.len());
21
+ }
22
+ Err(e) => {
23
+ println!(" ✗ Failed: {}", e);
24
+ std::process::exit(1);
25
+ }
26
+ }
27
+
28
+ // Test 2: Warm iteration
29
+ println!("\nTest 2: Warm iteration");
30
+ let start = Instant::now();
31
+ match extract_file("test_documents/pdfs/fake_memo.pdf", None, &config).await {
32
+ Ok(result) => {
33
+ let duration = start.elapsed();
34
+ println!(" ✓ Success! Duration: {:?}", duration);
35
+ println!(" ✓ Text length: {} chars", result.content.len());
36
+ }
37
+ Err(e) => {
38
+ println!(" ✗ Failed: {}", e);
39
+ std::process::exit(1);
40
+ }
41
+ }
42
+
43
+ // Test 3: Academic Paper (font-heavy)
44
+ println!("\nTest 3: Academic Paper (18 fonts)");
45
+ let start = Instant::now();
46
+ match extract_file(
47
+ "test_documents/pdfs/a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
48
+ None,
49
+ &config,
50
+ )
51
+ .await
52
+ {
53
+ Ok(result) => {
54
+ let duration = start.elapsed();
55
+ println!(" ✓ Success! Duration: {:?}", duration);
56
+ println!(" ✓ Text length: {} chars", result.content.len());
57
+ }
58
+ Err(e) => {
59
+ println!(" ✗ Failed: {}", e);
60
+ std::process::exit(1);
61
+ }
62
+ }
63
+
64
+ println!("\n✅ All tests passed! Cleaned pdfium-render fork is working correctly.");
65
+ }
@@ -1254,6 +1254,9 @@ enable_quality_processing = true
1254
1254
  let config1 = ExtractionConfig::from_toml_file(&config_path).unwrap();
1255
1255
  assert!(!config1.use_cache);
1256
1256
 
1257
+ // Sleep to ensure mtime changes (some filesystems have 1-second granularity)
1258
+ std::thread::sleep(std::time::Duration::from_secs(1));
1259
+
1257
1260
  fs::write(
1258
1261
  &config_path,
1259
1262
  r#"
@@ -293,49 +293,32 @@ impl EpubExtractor {
293
293
  }
294
294
 
295
295
  /// Extract metadata from EPUB OPF file
296
- fn extract_metadata(opf_xml: &str) -> Result<BTreeMap<String, serde_json::Value>> {
297
- let mut metadata = BTreeMap::new();
296
+ fn extract_metadata(opf_xml: &str) -> Result<(OepbMetadata, BTreeMap<String, serde_json::Value>)> {
297
+ let mut additional_metadata = BTreeMap::new();
298
298
 
299
299
  let (epub_metadata, _) = Self::parse_opf(opf_xml)?;
300
300
 
301
- if let Some(title) = epub_metadata.title {
302
- metadata.insert("title".to_string(), serde_json::json!(title));
301
+ if let Some(identifier) = epub_metadata.identifier.clone() {
302
+ additional_metadata.insert("identifier".to_string(), serde_json::json!(identifier));
303
303
  }
304
304
 
305
- if let Some(creator) = epub_metadata.creator {
306
- metadata.insert("creator".to_string(), serde_json::json!(creator.clone()));
307
- metadata.insert("authors".to_string(), serde_json::json!(vec![creator]));
305
+ if let Some(publisher) = epub_metadata.publisher.clone() {
306
+ additional_metadata.insert("publisher".to_string(), serde_json::json!(publisher));
308
307
  }
309
308
 
310
- if let Some(date) = epub_metadata.date {
311
- metadata.insert("date".to_string(), serde_json::json!(date));
309
+ if let Some(subject) = epub_metadata.subject.clone() {
310
+ additional_metadata.insert("subject".to_string(), serde_json::json!(subject));
312
311
  }
313
312
 
314
- if let Some(language) = epub_metadata.language {
315
- metadata.insert("language".to_string(), serde_json::json!(language));
313
+ if let Some(description) = epub_metadata.description.clone() {
314
+ additional_metadata.insert("description".to_string(), serde_json::json!(description));
316
315
  }
317
316
 
318
- if let Some(identifier) = epub_metadata.identifier {
319
- metadata.insert("identifier".to_string(), serde_json::json!(identifier));
317
+ if let Some(rights) = epub_metadata.rights.clone() {
318
+ additional_metadata.insert("rights".to_string(), serde_json::json!(rights));
320
319
  }
321
320
 
322
- if let Some(publisher) = epub_metadata.publisher {
323
- metadata.insert("publisher".to_string(), serde_json::json!(publisher));
324
- }
325
-
326
- if let Some(subject) = epub_metadata.subject {
327
- metadata.insert("subject".to_string(), serde_json::json!(subject));
328
- }
329
-
330
- if let Some(description) = epub_metadata.description {
331
- metadata.insert("description".to_string(), serde_json::json!(description));
332
- }
333
-
334
- if let Some(rights) = epub_metadata.rights {
335
- metadata.insert("rights".to_string(), serde_json::json!(rights));
336
- }
337
-
338
- Ok(metadata)
321
+ Ok((epub_metadata, additional_metadata))
339
322
  }
340
323
 
341
324
  /// Parse container.xml to find the OPF file path
@@ -564,13 +547,18 @@ impl DocumentExtractor for EpubExtractor {
564
547
 
565
548
  let extracted_content = Self::extract_content(&mut archive, &opf_path, &manifest_dir)?;
566
549
 
567
- let metadata_btree = Self::extract_metadata(&opf_xml)?;
568
- let metadata_map: std::collections::HashMap<String, serde_json::Value> = metadata_btree.into_iter().collect();
550
+ let (epub_metadata, additional_metadata) = Self::extract_metadata(&opf_xml)?;
551
+ let metadata_map: std::collections::HashMap<String, serde_json::Value> =
552
+ additional_metadata.into_iter().collect();
569
553
 
570
554
  Ok(ExtractionResult {
571
555
  content: extracted_content,
572
556
  mime_type: mime_type.to_string(),
573
557
  metadata: Metadata {
558
+ title: epub_metadata.title,
559
+ authors: epub_metadata.creator.map(|c| vec![c]),
560
+ language: epub_metadata.language,
561
+ created_at: epub_metadata.date,
574
562
  additional: metadata_map,
575
563
  ..Default::default()
576
564
  },
@@ -392,26 +392,26 @@ impl DocumentExtractor for PdfExtractor {
392
392
  {
393
393
  // For WASM targets, PDFium must be properly initialized in the environment.
394
394
  // The error message will direct users to the documentation for setup requirements.
395
- let bindings =
396
- crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")
397
- .map_err(|pdf_err| {
398
- // Provide context-specific error for WASM PDF failures
399
- if pdf_err.to_string().contains("WASM") || pdf_err.to_string().contains("Module") {
400
- crate::error::KreuzbergError::Parsing {
401
- message: "PDF extraction requires proper WASM module initialization. \
402
- Ensure your WASM environment is set up with PDFium support. \
403
- See: https://docs.kreuzberg.dev/wasm/pdf"
404
- .to_string(),
405
- source: None,
406
- }
407
- } else {
408
- pdf_err.into()
395
+ crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium").map_err(
396
+ |pdf_err| {
397
+ // Provide context-specific error for WASM PDF failures
398
+ if pdf_err.to_string().contains("WASM") || pdf_err.to_string().contains("Module") {
399
+ crate::error::KreuzbergError::Parsing {
400
+ message: "PDF extraction requires proper WASM module initialization. \
401
+ Ensure your WASM environment is set up with PDFium support. \
402
+ See: https://docs.kreuzberg.dev/wasm/pdf"
403
+ .to_string(),
404
+ source: None,
409
405
  }
410
- })?;
411
- let pdfium = Pdfium::new(bindings);
406
+ } else {
407
+ pdf_err.into()
408
+ }
409
+ },
410
+ )?;
411
+ let pdfium = Pdfium;
412
412
 
413
413
  let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
414
- let err_msg = e.to_string();
414
+ let err_msg = crate::pdf::error::format_pdfium_error(e);
415
415
  if err_msg.contains("password") || err_msg.contains("Password") {
416
416
  PdfError::PasswordRequired
417
417
  } else {
@@ -431,13 +431,12 @@ impl DocumentExtractor for PdfExtractor {
431
431
  tokio::task::spawn_blocking(move || {
432
432
  let _guard = span.entered();
433
433
 
434
- let bindings =
435
- crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
434
+ crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
436
435
 
437
- let pdfium = Pdfium::new(bindings);
436
+ let pdfium = Pdfium;
438
437
 
439
438
  let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
440
- let err_msg = e.to_string();
439
+ let err_msg = crate::pdf::error::format_pdfium_error(e);
441
440
  if err_msg.contains("password") || err_msg.contains("Password") {
442
441
  PdfError::PasswordRequired
443
442
  } else {
@@ -464,13 +463,12 @@ impl DocumentExtractor for PdfExtractor {
464
463
  .await
465
464
  .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
466
465
  } else {
467
- let bindings =
468
- crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
466
+ crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
469
467
 
470
- let pdfium = Pdfium::new(bindings);
468
+ let pdfium = Pdfium;
471
469
 
472
470
  let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
473
- let err_msg = e.to_string();
471
+ let err_msg = crate::pdf::error::format_pdfium_error(e);
474
472
  if err_msg.contains("password") || err_msg.contains("Password") {
475
473
  PdfError::PasswordRequired
476
474
  } else {
@@ -484,13 +482,12 @@ impl DocumentExtractor for PdfExtractor {
484
482
  }
485
483
  #[cfg(all(not(target_arch = "wasm32"), not(feature = "tokio-runtime")))]
486
484
  {
487
- let bindings =
488
- crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
485
+ crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
489
486
 
490
- let pdfium = Pdfium::new(bindings);
487
+ let pdfium = Pdfium;
491
488
 
492
489
  let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
493
- let err_msg = e.to_string();
490
+ let err_msg = crate::pdf::error::format_pdfium_error(e);
494
491
  if err_msg.contains("password") || err_msg.contains("Password") {
495
492
  PdfError::PasswordRequired
496
493
  } else {
@@ -82,10 +82,7 @@ fn bind_pdfium_impl() -> Result<(Option<PathBuf>, Box<dyn PdfiumLibraryBindings>
82
82
  /// Instead of failing permanently, we recover by extracting the inner value from the
83
83
  /// poisoned lock and proceeding. This ensures PDF extraction can continue even if an
84
84
  /// earlier panic occurred, as long as the state is consistent.
85
- pub(crate) fn bind_pdfium(
86
- map_err: fn(String) -> PdfError,
87
- context: &'static str,
88
- ) -> Result<Box<dyn PdfiumLibraryBindings>, PdfError> {
85
+ pub(crate) fn bind_pdfium(map_err: fn(String) -> PdfError, context: &'static str) -> Result<Pdfium, PdfError> {
89
86
  let mut state = PDFIUM_STATE.lock().unwrap_or_else(|poisoned| {
90
87
  // SAFETY: Recovering from a poisoned lock is safe here because:
91
88
  // 1. The poisoned state still contains valid data (just a guard from a panicked thread)
@@ -97,55 +94,26 @@ pub(crate) fn bind_pdfium(
97
94
  // Initialize on first call
98
95
  match &*state {
99
96
  InitializationState::Uninitialized => match bind_pdfium_impl() {
100
- Ok((lib_dir, _bindings)) => {
97
+ Ok((lib_dir, bindings)) => {
98
+ // Initialize Pdfium singleton with the bindings and return it
99
+ let pdfium = Pdfium::new(bindings);
101
100
  *state = InitializationState::Initialized { lib_dir };
101
+ Ok(pdfium)
102
102
  }
103
103
  Err(err) => {
104
104
  *state = InitializationState::Failed(err.clone());
105
- return Err(map_err(format!("Pdfium initialization failed ({}): {}", context, err)));
105
+ Err(map_err(format!("Pdfium initialization failed ({}): {}", context, err)))
106
106
  }
107
107
  },
108
- InitializationState::Failed(err) => {
109
- return Err(map_err(format!(
110
- "Pdfium initialization previously failed ({}): {}",
111
- context, err
112
- )));
113
- }
108
+ InitializationState::Failed(err) => Err(map_err(format!(
109
+ "Pdfium initialization previously failed ({}): {}",
110
+ context, err
111
+ ))),
114
112
  InitializationState::Initialized { .. } => {
115
- // Already initialized, proceed to create bindings below
116
- }
117
- }
118
-
119
- // Create fresh bindings from cached state
120
- #[cfg(all(feature = "pdf", feature = "bundled-pdfium", not(target_arch = "wasm32")))]
121
- {
122
- match &*state {
123
- InitializationState::Initialized { lib_dir: Some(lib_dir) } => {
124
- Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(lib_dir))
125
- .map_err(|e| map_err(format!("Failed to create Pdfium bindings ({}): {}", context, e)))
126
- }
127
- _ => {
128
- // This should not happen as state is guaranteed to be Initialized here
129
- Err(map_err(format!(
130
- "Internal error: Pdfium state not properly initialized ({})",
131
- context
132
- )))
133
- }
113
+ // Already initialized, return a new accessor to the singleton
114
+ Ok(Pdfium)
134
115
  }
135
116
  }
136
-
137
- // For system pdfium or WASM, create fresh bindings
138
- #[cfg(all(feature = "pdf", feature = "bundled-pdfium", target_arch = "wasm32"))]
139
- {
140
- Pdfium::bind_to_system_library()
141
- .map_err(|e| map_err(format!("Failed to create Pdfium bindings ({}): {}", context, e)))
142
- }
143
-
144
- #[cfg(all(feature = "pdf", not(feature = "bundled-pdfium")))]
145
- {
146
- Pdfium::bind_to_system_library()
147
- .map_err(|e| map_err(format!("Failed to create Pdfium bindings ({}): {}", context, e)))
148
- }
149
117
  }
150
118
 
151
119
  #[cfg(test)]
@@ -158,6 +126,9 @@ mod tests {
158
126
  // First call should initialize
159
127
  let result = bind_pdfium(PdfError::TextExtractionFailed, "test context");
160
128
  assert!(result.is_ok(), "First bind_pdfium call should succeed");
129
+ // Verify the returned Pdfium instance is usable
130
+ let pdfium = result.unwrap();
131
+ assert!(pdfium.is_pdfium_ready(), "Pdfium should be initialized");
161
132
  }
162
133
 
163
134
  #[test]
@@ -11,6 +11,7 @@ pub enum PdfError {
11
11
  RenderingFailed(String),
12
12
  MetadataExtractionFailed(String),
13
13
  ExtractionFailed(String),
14
+ FontLoadingFailed(String),
14
15
  IOError(String),
15
16
  }
16
17
 
@@ -30,6 +31,7 @@ impl fmt::Display for PdfError {
30
31
  write!(f, "Metadata extraction failed: {}", msg)
31
32
  }
32
33
  PdfError::ExtractionFailed(msg) => write!(f, "Extraction failed: {}", msg),
34
+ PdfError::FontLoadingFailed(msg) => write!(f, "Font loading failed: {}", msg),
33
35
  PdfError::IOError(msg) => write!(f, "I/O error: {}", msg),
34
36
  }
35
37
  }
@@ -50,6 +52,34 @@ impl From<lopdf::Error> for PdfError {
50
52
 
51
53
  pub type Result<T> = std::result::Result<T, PdfError>;
52
54
 
55
/// Format a pdfium error for display.
///
/// The error is rendered through its `Debug` implementation (the only bound
/// this function requires) and, when the output has the tuple-variant shape
/// `Variant(payload)`, rewritten as `Variant: payload`:
///
/// * `PdfiumLibraryInternalError(FormatError)` -> `PdfiumLibraryInternalError: FormatError`
/// * `PdfiumLibraryInternalError(FormatError,)` -> `PdfiumLibraryInternalError: FormatError`
/// * `SomeError()` -> `SomeError`
/// * `SomeError` -> `SomeError`
///
/// Only the outermost pair of parentheses is removed, so nested payloads stay
/// balanced (`Outer(Inner(Leaf))` -> `Outer: Inner(Leaf)`). Debug output that
/// contains a `(` but does not end with `)` (for example a struct whose field
/// holds text with parentheses) is returned unchanged.
pub(crate) fn format_pdfium_error<E: std::fmt::Debug>(error: E) -> String {
    let debug_msg = format!("{:?}", error);

    // Split at the first '(' and require the matching ')' to be the last
    // character; anything else is not a simple tuple-variant rendering.
    let tuple_parts = debug_msg
        .split_once('(')
        .and_then(|(variant, rest)| rest.strip_suffix(')').map(|payload| (variant, payload)));

    match tuple_parts {
        Some((variant, payload)) => {
            // Drop a single trailing comma left over from "(payload,)".
            let payload = payload.strip_suffix(',').unwrap_or(payload);
            if payload.is_empty() {
                variant.to_string()
            } else {
                format!("{}: {}", variant, payload)
            }
        }
        None => debug_msg,
    }
}
82
+
53
83
  #[cfg(test)]
54
84
  mod tests {
55
85
  use super::*;
@@ -127,4 +157,71 @@ mod tests {
127
157
  let err = PdfError::ExtractionFailed("page data mismatch".to_string());
128
158
  assert_eq!(err.to_string(), "Extraction failed: page data mismatch");
129
159
  }
160
+
161
/// `FontLoadingFailed` renders its message behind the "Font loading failed: " prefix.
#[test]
fn test_font_loading_failed_error() {
    let rendered = PdfError::FontLoadingFailed(String::from("missing font file")).to_string();
    assert_eq!(rendered, "Font loading failed: missing font file");
}
166
+
167
/// A tuple struct wrapping a string is rewritten from `Name("payload")` to
/// `Name: "payload"`.
#[test]
fn test_format_pdfium_error_with_inner_value() {
    // Derived Debug renders this as `MockError("FormatError,")`.
    #[derive(Debug)]
    #[allow(dead_code)] // the field is only read through the derived Debug impl
    struct MockError(String);

    let formatted = format_pdfium_error(MockError("FormatError,".to_string()));

    // The comma sits inside the quoted payload, so it is preserved; only the
    // wrapping parentheses are replaced by ": ".
    assert_eq!(formatted, "MockError: \"FormatError,\"");
}
180
+
181
/// A unit struct has no parentheses in its Debug output, so the text passes
/// through unchanged.
#[test]
fn test_format_pdfium_error_simple() {
    #[derive(Debug)]
    struct SimpleError;

    assert_eq!(format_pdfium_error(SimpleError), "SimpleError");
}
190
+
191
/// Debug output with an empty payload, "EmptyInner()", collapses to the bare name.
#[test]
fn test_format_pdfium_error_empty_inner() {
    // A derived Debug impl never prints "()", so write the shape by hand to
    // actually exercise the empty-payload branch.
    struct EmptyInner;

    impl std::fmt::Debug for EmptyInner {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            f.write_str("EmptyInner()")
        }
    }

    assert_eq!(format_pdfium_error(EmptyInner), "EmptyInner");
}
201
+
202
/// A nested enum error shaped like the pdfium one is rendered as
/// "PdfiumLibraryInternalError: FormatError".
#[test]
fn test_format_pdfium_error_cleans_trailing_comma() {
    // Local stand-ins mirroring the shape of the pdfium error types.
    #[derive(Debug)]
    #[allow(dead_code)] // payload is only read through the derived Debug impl
    enum PdfiumError {
        PdfiumLibraryInternalError(InternalError),
    }

    #[derive(Debug)]
    #[allow(dead_code)] // constructed only to be formatted
    enum InternalError {
        FormatError,
    }

    let formatted =
        format_pdfium_error(PdfiumError::PdfiumLibraryInternalError(InternalError::FormatError));

    // Pin the exact text: no parentheses, no trailing comma, ": " separator.
    assert_eq!(formatted, "PdfiumLibraryInternalError: FormatError");
}
+ }
130
227
  }