kreuzberg 4.6.0 → 4.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.lock +83 -14
  4. data/ext/kreuzberg_rb/native/Cargo.toml +2 -1
  5. data/ext/kreuzberg_rb/native/src/config/types.rs +18 -0
  6. data/ext/kreuzberg_rb/native/src/extraction.rs +30 -0
  7. data/ext/kreuzberg_rb/native/src/lib.rs +5 -1
  8. data/lib/kreuzberg/config.rb +22 -8
  9. data/lib/kreuzberg/extraction_api.rb +37 -0
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +5 -1
  12. data/spec/binding/render_spec.rb +91 -0
  13. data/vendor/Cargo.toml +3 -3
  14. data/vendor/kreuzberg/Cargo.toml +6 -3
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +1 -0
  17. data/vendor/kreuzberg/src/api/openapi.rs +6 -0
  18. data/vendor/kreuzberg/src/core/config/extraction/core.rs +28 -0
  19. data/vendor/kreuzberg/src/core/config/extraction/file_config.rs +12 -0
  20. data/vendor/kreuzberg/src/core/extractor/batch.rs +23 -4
  21. data/vendor/kreuzberg/src/core/mime.rs +12 -0
  22. data/vendor/kreuzberg/src/error.rs +3 -0
  23. data/vendor/kreuzberg/src/extraction/mod.rs +6 -0
  24. data/vendor/kreuzberg/src/extraction/pst.rs +387 -0
  25. data/vendor/kreuzberg/src/extraction/structured.rs +214 -1
  26. data/vendor/kreuzberg/src/extraction/transform/content.rs +40 -7
  27. data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +69 -12
  28. data/vendor/kreuzberg/src/extraction/transform/mod.rs +159 -6
  29. data/vendor/kreuzberg/src/extractors/email.rs +3 -3
  30. data/vendor/kreuzberg/src/extractors/mod.rs +12 -2
  31. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +122 -53
  32. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +562 -194
  33. data/vendor/kreuzberg/src/extractors/pst.rs +264 -0
  34. data/vendor/kreuzberg/src/extractors/structured.rs +14 -4
  35. data/vendor/kreuzberg/src/mcp/errors.rs +5 -0
  36. data/vendor/kreuzberg/src/pdf/error.rs +18 -0
  37. data/vendor/kreuzberg/src/pdf/layout_runner.rs +214 -226
  38. data/vendor/kreuzberg/src/pdf/mod.rs +1 -1
  39. data/vendor/kreuzberg/src/pdf/rendering.rs +358 -32
  40. data/vendor/kreuzberg/src/pdf/text.rs +41 -2
  41. data/vendor/kreuzberg/test_documents/jsonl/simple.jsonl +3 -0
  42. data/vendor/kreuzberg/test_documents/jsonl/with_blanks.jsonl +5 -0
  43. data/vendor/kreuzberg/tests/api_consistency.rs +7 -0
  44. data/vendor/kreuzberg/tests/jsonl_integration.rs +82 -0
  45. data/vendor/kreuzberg/tests/pst_integration.rs +82 -0
  46. data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
  47. data/vendor/kreuzberg-ffi/kreuzberg.h +149 -2
  48. data/vendor/kreuzberg-ffi/src/config/merge.rs +7 -0
  49. data/vendor/kreuzberg-ffi/src/config_builder.rs +37 -0
  50. data/vendor/kreuzberg-ffi/src/lib.rs +6 -0
  51. data/vendor/kreuzberg-ffi/src/rendering.rs +325 -0
  52. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  53. data/vendor/kreuzberg-paddle-ocr/src/base_net.rs +15 -4
  54. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  55. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  56. metadata +10 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 203e9719bcf3cf2cda1252dcd7a5c5782e7b73936a304b626a351894d4fcd909
4
- data.tar.gz: 2c02a45c882ef6b6b6935896e9334c46f012aaf8bc6f6669fa3c0110b67398e5
3
+ metadata.gz: f087c152499deb621223ba54a8ca450bb0510da3b43c880f3969e33ec4b4d5e5
4
+ data.tar.gz: dba38092babe378ec93e0dd4f307ab44c10519623302fdf14023a52eff6549b2
5
5
  SHA512:
6
- metadata.gz: d3dde81c8c38b1ee99bed3cae32e477e4c8941d401c6449fc9c3eec3608a5b771b47c20ab3a9679ccf75059fed5e6c09f9d91eefed83a7d9dc59eebf7acb5626
7
- data.tar.gz: e590247800d9752175985ee3b8ad0c89c5926f1afa0669a881cf476455c8514332880d30ff35d46a2836cf9cdc18b752296fb06a545f42129b548b5675180a71
6
+ metadata.gz: 2e2cb3a3636555ac5f7d54bf473e36872efef5cf1796c667f727fa8622792a5cec009799d35c5ccb776997b7c39a0e5d0745266c7c5e5604d22b4e3e3e2f5e1f
7
+ data.tar.gz: b032174ab9b1366d3d05082ef6cfe3b1b700ff66d265fb28353d69b581aee6989688dd8e01b8c516f7046a8968ca86edd8a80cb72cb48477607eb8104d7facb3
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.0" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.2" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -499,6 +499,15 @@ dependencies = [
499
499
  "generic-array",
500
500
  ]
501
501
 
502
+ [[package]]
503
+ name = "block-buffer"
504
+ version = "0.12.0"
505
+ source = "registry+https://github.com/rust-lang/crates.io-index"
506
+ checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be"
507
+ dependencies = [
508
+ "hybrid-array",
509
+ ]
510
+
502
511
  [[package]]
503
512
  name = "block-padding"
504
513
  version = "0.3.3"
@@ -726,7 +735,7 @@ version = "0.4.4"
726
735
  source = "registry+https://github.com/rust-lang/crates.io-index"
727
736
  checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
728
737
  dependencies = [
729
- "crypto-common",
738
+ "crypto-common 0.1.7",
730
739
  "inout",
731
740
  ]
732
741
 
@@ -877,6 +886,12 @@ dependencies = [
877
886
  "web-sys",
878
887
  ]
879
888
 
889
+ [[package]]
890
+ name = "const-oid"
891
+ version = "0.10.2"
892
+ source = "registry+https://github.com/rust-lang/crates.io-index"
893
+ checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c"
894
+
880
895
  [[package]]
881
896
  name = "const-random"
882
897
  version = "0.1.18"
@@ -1034,6 +1049,15 @@ dependencies = [
1034
1049
  "typenum",
1035
1050
  ]
1036
1051
 
1052
+ [[package]]
1053
+ name = "crypto-common"
1054
+ version = "0.2.1"
1055
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1056
+ checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710"
1057
+ dependencies = [
1058
+ "hybrid-array",
1059
+ ]
1060
+
1037
1061
  [[package]]
1038
1062
  name = "ctor"
1039
1063
  version = "0.6.3"
@@ -1232,8 +1256,19 @@ version = "0.10.7"
1232
1256
  source = "registry+https://github.com/rust-lang/crates.io-index"
1233
1257
  checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
1234
1258
  dependencies = [
1235
- "block-buffer",
1236
- "crypto-common",
1259
+ "block-buffer 0.10.4",
1260
+ "crypto-common 0.1.7",
1261
+ ]
1262
+
1263
+ [[package]]
1264
+ name = "digest"
1265
+ version = "0.11.2"
1266
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1267
+ checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c"
1268
+ dependencies = [
1269
+ "block-buffer 0.12.0",
1270
+ "const-oid",
1271
+ "crypto-common 0.2.1",
1237
1272
  ]
1238
1273
 
1239
1274
  [[package]]
@@ -2061,6 +2096,15 @@ version = "1.0.3"
2061
2096
  source = "registry+https://github.com/rust-lang/crates.io-index"
2062
2097
  checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
2063
2098
 
2099
+ [[package]]
2100
+ name = "hybrid-array"
2101
+ version = "0.4.8"
2102
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2103
+ checksum = "8655f91cd07f2b9d0c24137bd650fe69617773435ee5ec83022377777ce65ef1"
2104
+ dependencies = [
2105
+ "typenum",
2106
+ ]
2107
+
2064
2108
  [[package]]
2065
2109
  name = "hyper"
2066
2110
  version = "1.8.1"
@@ -2612,7 +2656,7 @@ dependencies = [
2612
2656
 
2613
2657
  [[package]]
2614
2658
  name = "kreuzberg"
2615
- version = "4.6.0"
2659
+ version = "4.6.2"
2616
2660
  dependencies = [
2617
2661
  "ahash",
2618
2662
  "async-trait",
@@ -2626,6 +2670,7 @@ dependencies = [
2626
2670
  "calamine",
2627
2671
  "cfb 0.14.0",
2628
2672
  "chardetng",
2673
+ "chrono",
2629
2674
  "dashmap",
2630
2675
  "dbase",
2631
2676
  "encoding_rs",
@@ -2658,6 +2703,7 @@ dependencies = [
2658
2703
  "opentelemetry_sdk",
2659
2704
  "org",
2660
2705
  "ort",
2706
+ "outlook-pst",
2661
2707
  "parking_lot",
2662
2708
  "pastey 0.2.1",
2663
2709
  "pkg-config",
@@ -2673,9 +2719,10 @@ dependencies = [
2673
2719
  "serde_json",
2674
2720
  "serde_yaml_ng",
2675
2721
  "sevenz-rust2",
2676
- "sha2",
2722
+ "sha2 0.11.0",
2677
2723
  "snap",
2678
2724
  "tar",
2725
+ "tempfile",
2679
2726
  "text-splitter",
2680
2727
  "thiserror 2.0.18",
2681
2728
  "tiff",
@@ -2696,7 +2743,7 @@ dependencies = [
2696
2743
 
2697
2744
  [[package]]
2698
2745
  name = "kreuzberg-ffi"
2699
- version = "4.6.0"
2746
+ version = "4.6.2"
2700
2747
  dependencies = [
2701
2748
  "ahash",
2702
2749
  "async-trait",
@@ -2712,7 +2759,7 @@ dependencies = [
2712
2759
 
2713
2760
  [[package]]
2714
2761
  name = "kreuzberg-paddle-ocr"
2715
- version = "4.6.0"
2762
+ version = "4.6.2"
2716
2763
  dependencies = [
2717
2764
  "geo-clipper",
2718
2765
  "geo-types",
@@ -2726,7 +2773,7 @@ dependencies = [
2726
2773
 
2727
2774
  [[package]]
2728
2775
  name = "kreuzberg-pdfium-render"
2729
- version = "4.6.0"
2776
+ version = "4.6.2"
2730
2777
  dependencies = [
2731
2778
  "bitflags",
2732
2779
  "bytemuck",
@@ -2749,7 +2796,7 @@ dependencies = [
2749
2796
 
2750
2797
  [[package]]
2751
2798
  name = "kreuzberg-rb"
2752
- version = "4.6.0"
2799
+ version = "4.6.2"
2753
2800
  dependencies = [
2754
2801
  "async-trait",
2755
2802
  "html-to-markdown-rs",
@@ -2766,7 +2813,7 @@ dependencies = [
2766
2813
 
2767
2814
  [[package]]
2768
2815
  name = "kreuzberg-tesseract"
2769
- version = "4.6.0"
2816
+ version = "4.6.2"
2770
2817
  dependencies = [
2771
2818
  "cc",
2772
2819
  "cmake",
@@ -2931,7 +2978,7 @@ dependencies = [
2931
2978
  "rand 0.10.0",
2932
2979
  "rangemap",
2933
2980
  "rayon",
2934
- "sha2",
2981
+ "sha2 0.10.9",
2935
2982
  "stringprep",
2936
2983
  "thiserror 2.0.18",
2937
2984
  "time",
@@ -3068,7 +3115,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
3068
3115
  checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
3069
3116
  dependencies = [
3070
3117
  "cfg-if",
3071
- "digest",
3118
+ "digest 0.10.7",
3072
3119
  ]
3073
3120
 
3074
3121
  [[package]]
@@ -3540,6 +3587,17 @@ dependencies = [
3540
3587
  "ureq 3.3.0",
3541
3588
  ]
3542
3589
 
3590
+ [[package]]
3591
+ name = "outlook-pst"
3592
+ version = "1.2.0"
3593
+ source = "registry+https://github.com/rust-lang/crates.io-index"
3594
+ checksum = "299eace9d895cc31490927d7de779f8e8f3deb5fb6bd1e68eb192aa1da19bd47"
3595
+ dependencies = [
3596
+ "byteorder",
3597
+ "thiserror 2.0.18",
3598
+ "tracing",
3599
+ ]
3600
+
3543
3601
  [[package]]
3544
3602
  name = "parking_lot"
3545
3603
  version = "0.12.5"
@@ -4652,7 +4710,7 @@ dependencies = [
4652
4710
  "js-sys",
4653
4711
  "lzma-rust2 0.16.2",
4654
4712
  "ppmd-rust",
4655
- "sha2",
4713
+ "sha2 0.10.9",
4656
4714
  "wasm-bindgen",
4657
4715
  ]
4658
4716
 
@@ -4664,7 +4722,18 @@ checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
4664
4722
  dependencies = [
4665
4723
  "cfg-if",
4666
4724
  "cpufeatures 0.2.17",
4667
- "digest",
4725
+ "digest 0.10.7",
4726
+ ]
4727
+
4728
+ [[package]]
4729
+ name = "sha2"
4730
+ version = "0.11.0"
4731
+ source = "registry+https://github.com/rust-lang/crates.io-index"
4732
+ checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4"
4733
+ dependencies = [
4734
+ "cfg-if",
4735
+ "cpufeatures 0.3.0",
4736
+ "digest 0.11.2",
4668
4737
  ]
4669
4738
 
4670
4739
  [[package]]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.6.0"
3
+ version = "4.6.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -36,6 +36,7 @@ kreuzberg = { path = "../../../vendor/kreuzberg", default-features = false, feat
36
36
  "chunking",
37
37
  "chunking-tokenizers",
38
38
  "embeddings",
39
+ "ort-bundled",
39
40
  "quality",
40
41
  "keywords",
41
42
  "api",
@@ -875,6 +875,17 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
875
875
  config.force_ocr = bool::try_convert(val)?;
876
876
  }
877
877
 
878
+ if let Some(val) = get_kw(ruby, hash, "force_ocr_pages")
879
+ && val.equal(ruby.qnil()).ok() != Some(true)
880
+ {
881
+ let pages_array = magnus::RArray::try_convert(val)?;
882
+ let pages: Vec<usize> = pages_array
883
+ .into_iter()
884
+ .map(|v| usize::try_convert(v))
885
+ .collect::<Result<Vec<_>, _>>()?;
886
+ config.force_ocr_pages = Some(pages);
887
+ }
888
+
878
889
  if let Some(val) = get_kw(ruby, hash, "include_document_structure") {
879
890
  config.include_document_structure = bool::try_convert(val)?;
880
891
  }
@@ -1013,6 +1024,13 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
1013
1024
  }
1014
1025
  };
1015
1026
  }
1027
+
1028
+ if let Some(val) = get_kw(ruby, hash, "extraction_timeout_secs")
1029
+ && val.equal(ruby.qnil()).ok() != Some(true)
1030
+ {
1031
+ let secs = u64::try_convert(val)?;
1032
+ config.extraction_timeout_secs = Some(secs);
1033
+ }
1016
1034
  }
1017
1035
 
1018
1036
  Ok(config)
@@ -58,6 +58,36 @@ pub fn extract_file(args: &[Value]) -> Result<RHash, Error> {
58
58
  extraction_result_to_ruby(&ruby, result)
59
59
  }
60
60
 
61
+ /// Iterate over PDF pages, yielding (page_index, png_bytes) per page to a Ruby block.
62
+ pub fn render_pdf_pages_iter(path: String, dpi: i32) -> Result<(), Error> {
63
+ let ruby = Ruby::get().expect("Ruby not initialized");
64
+ let dpi_opt = if dpi <= 0 { None } else { Some(dpi) };
65
+
66
+ let iter = kreuzberg::pdf::PdfPageIterator::from_file(&path, dpi_opt, None)
67
+ .map_err(|e| kreuzberg_error(e.into()))?;
68
+
69
+ for result in iter {
70
+ let (page_index, png_bytes) = result.map_err(|e| kreuzberg_error(e.into()))?;
71
+ let rb_index = ruby.integer_from_i64(page_index as i64);
72
+ let rb_bytes = ruby.str_from_slice(&png_bytes);
73
+ let _: magnus::Value = ruby.yield_values((rb_index, rb_bytes))?;
74
+ }
75
+
76
+ Ok(())
77
+ }
78
+
79
+ /// Render a single PDF page to PNG bytes.
80
+ pub fn native_render_pdf_page(path: String, page_index: i64, dpi: i64) -> Result<Vec<u8>, Error> {
81
+ if page_index < 0 {
82
+ return Err(crate::error_handling::runtime_error("page_index must be non-negative"));
83
+ }
84
+ let pdf_bytes = std::fs::read(&path)
85
+ .map_err(|e| crate::error_handling::runtime_error(format!("Failed to read file: {}", e)))?;
86
+ let dpi_opt = if dpi <= 0 { None } else { Some(dpi as i32) };
87
+ kreuzberg::pdf::render_pdf_page_to_png(&pdf_bytes, page_index as usize, dpi_opt, None)
88
+ .map_err(|e| kreuzberg_error(e.into()))
89
+ }
90
+
61
91
  /// Extract content from bytes (asynchronous)
62
92
  pub fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
63
93
  let ruby = Ruby::get().expect("Ruby not initialized");
@@ -23,7 +23,7 @@ pub use gc_guarded_value::GcGuardedValue;
23
23
  pub use helpers::{get_kw, set_hash_entry, json_value_to_ruby, ruby_value_to_json, cache_root_dir, cache_directories};
24
24
  pub use config::parse_extraction_config;
25
25
  pub use result::extraction_result_to_ruby;
26
- pub use extraction::{extract_file_sync, extract_bytes_sync, extract_file, extract_bytes};
26
+ pub use extraction::{extract_file_sync, extract_bytes_sync, extract_file, extract_bytes, render_pdf_pages_iter, native_render_pdf_page};
27
27
  pub use batch::{
28
28
  batch_extract_files_sync, batch_extract_bytes_sync, batch_extract_files, batch_extract_bytes,
29
29
  };
@@ -442,6 +442,10 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
442
442
  module.define_module_function("batch_extract_files", function!(batch_extract_files, -1))?;
443
443
  module.define_module_function("batch_extract_bytes", function!(batch_extract_bytes, -1))?;
444
444
 
445
+ // PDF page iterator
446
+ module.define_module_function("native_render_pdf_pages_iter", function!(render_pdf_pages_iter, 2))?;
447
+ module.define_module_function("native_render_pdf_page", function!(native_render_pdf_page, 3))?;
448
+
445
449
  // Cache functions
446
450
  module.define_module_function("clear_cache", function!(ruby_clear_cache, 0))?;
447
451
  module.define_module_function("cache_stats", function!(ruby_cache_stats, 0))?;
@@ -926,14 +926,14 @@ module Kreuzberg
926
926
  # )
927
927
  #
928
928
  class Extraction
929
- attr_reader :use_cache, :enable_quality_processing, :force_ocr,
929
+ attr_reader :use_cache, :enable_quality_processing, :force_ocr, :force_ocr_pages,
930
930
  :include_document_structure,
931
931
  :ocr, :chunking, :language_detection, :pdf_options,
932
932
  :images, :postprocessor,
933
933
  :token_reduction, :keywords, :html_options, :pages,
934
934
  :max_concurrent_extractions, :output_format, :result_format,
935
935
  :security_limits, :layout, :concurrency,
936
- :cache_namespace, :cache_ttl_secs
936
+ :cache_namespace, :cache_ttl_secs, :extraction_timeout_secs
937
937
 
938
938
  # Alias for backward compatibility - image_extraction is the canonical name
939
939
  alias image_extraction images
@@ -954,11 +954,11 @@ module Kreuzberg
954
954
  #
955
955
  # Keys that are allowed in the Extraction config
956
956
  ALLOWED_KEYS = %i[
957
- use_cache enable_quality_processing force_ocr include_document_structure ocr chunking
957
+ use_cache enable_quality_processing force_ocr force_ocr_pages include_document_structure ocr chunking
958
958
  language_detection pdf_options image_extraction
959
959
  postprocessor token_reduction keywords html_options pages
960
960
  max_concurrent_extractions output_format result_format
961
- security_limits layout concurrency cache_namespace cache_ttl_secs
961
+ security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
962
962
  ].freeze
963
963
 
964
964
  # Aliases for backward compatibility
@@ -1019,6 +1019,7 @@ module Kreuzberg
1019
1019
  use_cache: true,
1020
1020
  enable_quality_processing: true,
1021
1021
  force_ocr: false,
1022
+ force_ocr_pages: nil,
1022
1023
  include_document_structure: false,
1023
1024
  ocr: nil,
1024
1025
  chunking: nil,
@@ -1037,10 +1038,12 @@ module Kreuzberg
1037
1038
  layout: nil,
1038
1039
  concurrency: nil,
1039
1040
  cache_namespace: nil,
1040
- cache_ttl_secs: nil)
1041
+ cache_ttl_secs: nil,
1042
+ extraction_timeout_secs: nil)
1041
1043
  kwargs = {
1042
1044
  use_cache: use_cache, enable_quality_processing: enable_quality_processing,
1043
- force_ocr: force_ocr, include_document_structure: include_document_structure,
1045
+ force_ocr: force_ocr, force_ocr_pages: force_ocr_pages,
1046
+ include_document_structure: include_document_structure,
1044
1047
  ocr: ocr, chunking: chunking, language_detection: language_detection,
1045
1048
  pdf_options: pdf_options, image_extraction: image_extraction,
1046
1049
  postprocessor: postprocessor,
@@ -1050,7 +1053,8 @@ module Kreuzberg
1050
1053
  security_limits: security_limits, layout: layout,
1051
1054
  concurrency: concurrency,
1052
1055
  cache_namespace: cache_namespace,
1053
- cache_ttl_secs: cache_ttl_secs
1056
+ cache_ttl_secs: cache_ttl_secs,
1057
+ extraction_timeout_secs: extraction_timeout_secs
1054
1058
  }
1055
1059
  extracted = extract_from_hash(hash, kwargs)
1056
1060
 
@@ -1068,6 +1072,7 @@ module Kreuzberg
1068
1072
  @use_cache = params[:use_cache] ? true : false
1069
1073
  @enable_quality_processing = params[:enable_quality_processing] ? true : false
1070
1074
  @force_ocr = params[:force_ocr] ? true : false
1075
+ @force_ocr_pages = params[:force_ocr_pages]
1071
1076
  @include_document_structure = params[:include_document_structure] ? true : false
1072
1077
  @ocr = normalize_config(params[:ocr], OCR)
1073
1078
  @chunking = normalize_config(params[:chunking], Chunking)
@@ -1086,6 +1091,7 @@ module Kreuzberg
1086
1091
  @result_format = validate_result_format(params[:result_format])
1087
1092
  @cache_namespace = params[:cache_namespace]
1088
1093
  @cache_ttl_secs = params[:cache_ttl_secs]&.to_i
1094
+ @extraction_timeout_secs = params[:extraction_timeout_secs]&.to_i
1089
1095
  @security_limits = params[:security_limits]
1090
1096
  end
1091
1097
 
@@ -1118,12 +1124,14 @@ module Kreuzberg
1118
1124
  use_cache: @use_cache,
1119
1125
  enable_quality_processing: @enable_quality_processing,
1120
1126
  force_ocr: @force_ocr,
1127
+ force_ocr_pages: @force_ocr_pages,
1121
1128
  include_document_structure: @include_document_structure,
1122
1129
  max_concurrent_extractions: @max_concurrent_extractions,
1123
1130
  output_format: @output_format,
1124
1131
  result_format: @result_format,
1125
1132
  cache_namespace: @cache_namespace,
1126
- cache_ttl_secs: @cache_ttl_secs
1133
+ cache_ttl_secs: @cache_ttl_secs,
1134
+ extraction_timeout_secs: @extraction_timeout_secs
1127
1135
  }
1128
1136
  end
1129
1137
 
@@ -1250,6 +1258,8 @@ module Kreuzberg
1250
1258
  @enable_quality_processing = value ? true : false
1251
1259
  when :force_ocr
1252
1260
  @force_ocr = value ? true : false
1261
+ when :force_ocr_pages
1262
+ @force_ocr_pages = value
1253
1263
  when :include_document_structure
1254
1264
  @include_document_structure = value ? true : false
1255
1265
  when :ocr
@@ -1286,6 +1296,8 @@ module Kreuzberg
1286
1296
  @cache_namespace = value
1287
1297
  when :cache_ttl_secs
1288
1298
  @cache_ttl_secs = value&.to_i
1299
+ when :extraction_timeout_secs
1300
+ @extraction_timeout_secs = value&.to_i
1289
1301
  else
1290
1302
  raise ArgumentError, "Unknown configuration key: #{key}"
1291
1303
  end
@@ -1345,6 +1357,7 @@ module Kreuzberg
1345
1357
  @use_cache = merged.use_cache
1346
1358
  @enable_quality_processing = merged.enable_quality_processing
1347
1359
  @force_ocr = merged.force_ocr
1360
+ @force_ocr_pages = merged.force_ocr_pages
1348
1361
  @include_document_structure = merged.include_document_structure
1349
1362
  @ocr = merged.ocr
1350
1363
  @chunking = merged.chunking
@@ -1369,6 +1382,7 @@ module Kreuzberg
1369
1382
  @result_format = merged.result_format
1370
1383
  @cache_namespace = merged.cache_namespace
1371
1384
  @cache_ttl_secs = merged.cache_ttl_secs
1385
+ @extraction_timeout_secs = merged.extraction_timeout_secs
1372
1386
  end
1373
1387
  end
1374
1388
  end
@@ -319,6 +319,43 @@ module Kreuzberg
319
319
  results
320
320
  end
321
321
 
322
+ # Render a single PDF page as a PNG image.
323
+ #
324
+ # @param path [String, Pathname] Path to the PDF file
325
+ # @param page_index [Integer] Zero-based page index
326
+ # @param dpi [Integer] Rendering resolution (default 150)
327
+ # @return [String] PNG-encoded binary string
328
+ # @raise [Errors::IOError] If the file cannot be read
329
+ # @raise [Errors::ParsingError] If rendering fails
330
+ def render_pdf_page(path, page_index, dpi: 150)
331
+ path_str = path.to_s
332
+ raise ArgumentError, 'page_index must be non-negative' if page_index.negative?
333
+ raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
334
+
335
+ native_render_pdf_page(path_str, page_index, dpi)
336
+ end
337
+
338
+ # Iterate over pages of a PDF lazily, yielding each page as it is rendered.
339
+ #
340
+ # Each page is rendered via the native FFI iterator, so only one page is in
341
+ # memory at a time.
342
+ #
343
+ # @param path [String, Pathname] Path to the PDF file
344
+ # @param dpi [Integer] Rendering resolution (default 150)
345
+ # @yieldparam page_index [Integer] Zero-based page index
346
+ # @yieldparam png_bytes [String] PNG-encoded binary string for the page
347
+ # @return [Enumerator] if no block is given
348
+ # @raise [Errors::IOError] If the file cannot be read
349
+ # @raise [Errors::ParsingError] If rendering fails
350
+ def render_pdf_pages_iter(path, dpi: 150, &block)
351
+ path_str = path.to_s
352
+ raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
353
+
354
+ return enum_for(:render_pdf_pages_iter, path, dpi: dpi) unless block
355
+
356
+ native_render_pdf_pages_iter(path_str, dpi, &block)
357
+ end
358
+
322
359
  def normalize_config(config)
323
360
  return {} if config.nil?
324
361
  return config if config.is_a?(Hash)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.6.0'
4
+ VERSION = '4.6.2'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -481,7 +481,9 @@ module Kreuzberg
481
481
  attr_reader enable_quality_processing: bool
482
482
  attr_reader cache_namespace: String?
483
483
  attr_reader cache_ttl_secs: Integer?
484
+ attr_reader extraction_timeout_secs: Integer?
484
485
  attr_reader force_ocr: bool
486
+ attr_reader force_ocr_pages: Array[Integer]?
485
487
  attr_reader include_document_structure: bool
486
488
  attr_reader ocr: OCR?
487
489
  attr_reader chunking: Chunking?
@@ -508,6 +510,7 @@ module Kreuzberg
508
510
  ?use_cache: bool,
509
511
  ?enable_quality_processing: bool,
510
512
  ?force_ocr: bool,
513
+ ?force_ocr_pages: Array[Integer]?,
511
514
  ?include_document_structure: bool,
512
515
  ?ocr: (OCR | Hash[Symbol, untyped])?,
513
516
  ?chunking: (Chunking | Hash[Symbol, untyped])?,
@@ -525,7 +528,8 @@ module Kreuzberg
525
528
  ?output_format: String?,
526
529
  ?result_format: String?,
527
530
  ?cache_namespace: String?,
528
- ?cache_ttl_secs: Integer?
531
+ ?cache_ttl_secs: Integer?,
532
+ ?extraction_timeout_secs: Integer?
529
533
  ) -> void
530
534
  def to_h: () -> Hash[Symbol, untyped]
531
535
  def to_json: (*untyped) -> String
@@ -0,0 +1,91 @@
1
+ # Hand-written binding-specific edge case tests for PDF rendering.
2
+ # Happy-path render tests are auto-generated from fixtures in e2e/.
3
+ # These tests cover error handling, validation, and lifecycle patterns
4
+ # that vary per language and can't be generated uniformly.
5
+
6
+ # frozen_string_literal: true
7
+
8
+ require 'spec_helper'
9
+
10
+ RSpec.describe 'PDF Rendering' do
11
+ it 'exposes rendering methods' do
12
+ expect(Kreuzberg).to respond_to(:render_pdf_page)
13
+ expect(Kreuzberg).to respond_to(:render_pdf_pages_iter)
14
+ end
15
+
16
+ describe '.render_pdf_page' do
17
+ it 'raises an error for a nonexistent file' do
18
+ expect do
19
+ Kreuzberg.render_pdf_page('/nonexistent/path/to/document.pdf', 0)
20
+ end.to raise_error(Kreuzberg::Errors::IOError)
21
+ end
22
+
23
+ it 'raises an error for an out-of-bounds page index' do
24
+ pdf_path = test_document_path('pdf/tiny.pdf')
25
+ skip 'Test PDF not available' unless File.exist?(pdf_path)
26
+
27
+ expect do
28
+ Kreuzberg.render_pdf_page(pdf_path, 9999)
29
+ end.to raise_error(StandardError)
30
+ end
31
+ end
32
+
33
+ describe '.render_pdf_page with negative index' do
34
+ it 'raises ArgumentError for a negative page index' do
35
+ pdf_path = test_document_path('pdf/tiny.pdf')
36
+ skip 'Test PDF not available' unless File.exist?(pdf_path)
37
+
38
+ expect do
39
+ Kreuzberg.render_pdf_page(pdf_path, -1)
40
+ end.to raise_error(ArgumentError)
41
+ end
42
+ end
43
+
44
+ describe '.render_pdf_pages_iter' do
45
+ it 'raises an error for a nonexistent file' do
46
+ expect do
47
+ Kreuzberg.render_pdf_pages_iter('/nonexistent/path/to/document.pdf') { |_, _| nil }
48
+ end.to raise_error(Kreuzberg::Errors::IOError)
49
+ end
50
+ end
51
+
52
+ describe '.render_pdf_page with empty path' do
53
+ it 'raises an error for an empty path' do
54
+ expect do
55
+ Kreuzberg.render_pdf_page('', 0)
56
+ end.to raise_error(StandardError)
57
+ end
58
+ end
59
+
60
+ describe '.render_pdf_pages_iter cleanup' do
61
+ it 'handles iterator cleanup without fully consuming' do
62
+ pdf_path = test_document_path('pdf/tiny.pdf')
63
+ skip 'Test PDF not available' unless File.exist?(pdf_path)
64
+
65
+ # Iterate but stop immediately — no crash
66
+ Kreuzberg.render_pdf_pages_iter(pdf_path) do |_page_index, _png_data|
67
+ break
68
+ end
69
+ end
70
+ end
71
+
72
+ describe '.render_pdf_pages_iter early termination' do
73
+ it 'returns valid PNG for the first page then stops' do
74
+ pdf_path = test_document_path('pdf/tiny.pdf')
75
+ skip 'Test PDF not available' unless File.exist?(pdf_path)
76
+
77
+ first_png = nil
78
+ Kreuzberg.render_pdf_pages_iter(pdf_path) do |page_index, png_data|
79
+ expect(page_index).to eq(0)
80
+ expect(png_data).to be_a(String)
81
+ expect(png_data.bytesize).to be > 8
82
+ # PNG magic bytes
83
+ expect(png_data.bytes[0..3]).to eq([0x89, 0x50, 0x4E, 0x47])
84
+ first_png = png_data
85
+ break
86
+ end
87
+
88
+ expect(first_png).not_to be_nil
89
+ end
90
+ end
91
+ end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.6.0"
5
+ version = "4.6.2"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -30,8 +30,8 @@ html-to-markdown-rs = { version = "2.29.0", default-features = false }
30
30
  image = { version = "0.25.10", default-features = false }
31
31
  itertools = "0.14"
32
32
  js-sys = "0.3"
33
- kreuzberg = { path = "./crates/kreuzberg", version = "4.6.0", default-features = false }
34
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.0" }
33
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.6.2", default-features = false }
34
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.2" }
35
35
  lazy_static = "1.5.0"
36
36
  libc = "0.2.183"
37
37
  log = "0.4"