kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ea76ed9d63fda80f47f1054c421989e77269b2d3bc8810ab25cd7e59b062ec7d
4
- data.tar.gz: de96aea5d18ed67f34117fca308c5cc30b9e719e60c2a8fb0384f50d1fbd704f
3
+ metadata.gz: 1b0f873b7da0609856d3f396a8d43cc30bcd62f7919fa0ec7572f32f990f99f6
4
+ data.tar.gz: ea902df98f54a593113a999dc08c7497d38008d0df3881a677249453b4ca3886
5
5
  SHA512:
6
- metadata.gz: 96fec6456241cf9a03ab60ea19ccf1ca8beda412ade51e298bc13480374a5ff7565fda57b2fb8c73edebc54ab3641d4832d740fc68a0e4bba2d610e9b340f682
7
- data.tar.gz: 254f57261deda88616238abeb5faffd7ebdc767fb01cdc712193edbabd6a5329aeadc55dc0d93c1e5eda491924662b947a737e40eb49e3a9d9b764239e4c1221
6
+ metadata.gz: 197bb0ad826ab4362efcff8dc5fded982360d9cb252d5150c786a762d28d3bf98c7d72b6e9334dd40ae1e65712a54c0261dbd71a037f098cafb599642269dae3
7
+ data.tar.gz: e206c9553e656a00ee7722e7f570475037689e874540b37fe5d9c6353e64ba0de05c82893b5d2ae057d68d3320d3e59c624553b8fff08f2ae3c896506c6c275b
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.0.0.pre.rc.11)
4
+ kreuzberg (4.0.0.pre.rc.14)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/Rakefile CHANGED
@@ -6,6 +6,13 @@ require 'rspec/core/rake_task'
6
6
 
7
7
  GEMSPEC = Gem::Specification.load(File.expand_path('kreuzberg.gemspec', __dir__))
8
8
 
9
+ # Vendor kreuzberg core crates before compilation
10
+ task :vendor do
11
+ vendor_script = File.expand_path('../../scripts/ci/ruby/vendor-kreuzberg-core.sh', __dir__)
12
+ puts 'Vendoring kreuzberg core crates...'
13
+ sh "bash #{vendor_script}"
14
+ end
15
+
9
16
  Rake::ExtensionTask.new('kreuzberg_rb', GEMSPEC) do |ext|
10
17
  ext.lib_dir = 'lib'
11
18
  ext.ext_dir = 'ext/kreuzberg_rb'
@@ -21,5 +28,6 @@ end
21
28
 
22
29
  RSpec::Core::RakeTask.new(:spec)
23
30
 
31
+ task compile: :vendor
24
32
  task spec: :compile
25
33
  task default: :spec
@@ -677,14 +677,14 @@ dependencies = [
677
677
  "serde_json",
678
678
  "syn",
679
679
  "tempfile",
680
- "toml 0.9.8",
680
+ "toml 0.9.10+spec-1.1.0",
681
681
  ]
682
682
 
683
683
  [[package]]
684
684
  name = "cc"
685
- version = "1.2.49"
685
+ version = "1.2.50"
686
686
  source = "registry+https://github.com/rust-lang/crates.io-index"
687
- checksum = "90583009037521a116abf44494efecd645ba48b6622457080f080b85544e2215"
687
+ checksum = "9f50d563227a1c37cc0a263f64eca3334388c01c5e4c4861a9def205c614383c"
688
688
  dependencies = [
689
689
  "find-msvc-tools",
690
690
  "jobserver",
@@ -809,9 +809,9 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
809
809
 
810
810
  [[package]]
811
811
  name = "cmake"
812
- version = "0.1.56"
812
+ version = "0.1.57"
813
813
  source = "registry+https://github.com/rust-lang/crates.io-index"
814
- checksum = "b042e5d8a74ae91bb0961acd039822472ec99f8ab0948cbf6d1369588f8be586"
814
+ checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d"
815
815
  dependencies = [
816
816
  "cc",
817
817
  ]
@@ -1038,12 +1038,12 @@ dependencies = [
1038
1038
 
1039
1039
  [[package]]
1040
1040
  name = "darling"
1041
- version = "0.21.3"
1041
+ version = "0.23.0"
1042
1042
  source = "registry+https://github.com/rust-lang/crates.io-index"
1043
- checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0"
1043
+ checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d"
1044
1044
  dependencies = [
1045
- "darling_core 0.21.3",
1046
- "darling_macro 0.21.3",
1045
+ "darling_core 0.23.0",
1046
+ "darling_macro 0.23.0",
1047
1047
  ]
1048
1048
 
1049
1049
  [[package]]
@@ -1062,11 +1062,10 @@ dependencies = [
1062
1062
 
1063
1063
  [[package]]
1064
1064
  name = "darling_core"
1065
- version = "0.21.3"
1065
+ version = "0.23.0"
1066
1066
  source = "registry+https://github.com/rust-lang/crates.io-index"
1067
- checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4"
1067
+ checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0"
1068
1068
  dependencies = [
1069
- "fnv",
1070
1069
  "ident_case",
1071
1070
  "proc-macro2",
1072
1071
  "quote",
@@ -1087,11 +1086,11 @@ dependencies = [
1087
1086
 
1088
1087
  [[package]]
1089
1088
  name = "darling_macro"
1090
- version = "0.21.3"
1089
+ version = "0.23.0"
1091
1090
  source = "registry+https://github.com/rust-lang/crates.io-index"
1092
- checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81"
1091
+ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d"
1093
1092
  dependencies = [
1094
- "darling_core 0.21.3",
1093
+ "darling_core 0.23.0",
1095
1094
  "quote",
1096
1095
  "syn",
1097
1096
  ]
@@ -1427,9 +1426,9 @@ dependencies = [
1427
1426
 
1428
1427
  [[package]]
1429
1428
  name = "fastembed"
1430
- version = "5.4.0"
1429
+ version = "5.5.0"
1431
1430
  source = "registry+https://github.com/rust-lang/crates.io-index"
1432
- checksum = "b0d719825156b62586040fd0e5653a4f7bc0ad9caf6c7ec38cb18f1a08ee0384"
1431
+ checksum = "de72c516a1484c70ba0d98597dafc6274484b542c9ee54e7a326160baa013849"
1433
1432
  dependencies = [
1434
1433
  "anyhow",
1435
1434
  "hf-hub",
@@ -1873,9 +1872,9 @@ dependencies = [
1873
1872
 
1874
1873
  [[package]]
1875
1874
  name = "html-to-markdown-rs"
1876
- version = "2.14.11"
1875
+ version = "2.15.0"
1877
1876
  source = "registry+https://github.com/rust-lang/crates.io-index"
1878
- checksum = "5d0480c8a1832b39cbc27cf09283c344d82530b86e5f7a9db417dc42da69395f"
1877
+ checksum = "7741e7928e84f3f3497c84b8dd27e9fcc3368bd133e44ca800715eb34a1d58c8"
1879
1878
  dependencies = [
1880
1879
  "astral-tl",
1881
1880
  "base64 0.22.1",
@@ -2415,7 +2414,7 @@ dependencies = [
2415
2414
 
2416
2415
  [[package]]
2417
2416
  name = "kreuzberg"
2418
- version = "4.0.0-rc.10"
2417
+ version = "4.0.0-rc.13"
2419
2418
  dependencies = [
2420
2419
  "ahash",
2421
2420
  "async-trait",
@@ -2450,7 +2449,7 @@ dependencies = [
2450
2449
  "opentelemetry",
2451
2450
  "opentelemetry_sdk",
2452
2451
  "org",
2453
- "paste",
2452
+ "pastey 0.2.1",
2454
2453
  "pdfium-render",
2455
2454
  "pkg-config",
2456
2455
  "polars",
@@ -2474,7 +2473,7 @@ dependencies = [
2474
2473
  "thiserror 2.0.17",
2475
2474
  "tiff",
2476
2475
  "tokio",
2477
- "toml 0.9.8",
2476
+ "toml 0.9.10+spec-1.1.0",
2478
2477
  "tower",
2479
2478
  "tower-http",
2480
2479
  "tracing",
@@ -2489,7 +2488,7 @@ dependencies = [
2489
2488
 
2490
2489
  [[package]]
2491
2490
  name = "kreuzberg-ffi"
2492
- version = "4.0.0-rc.10"
2491
+ version = "4.0.0-rc.13"
2493
2492
  dependencies = [
2494
2493
  "async-trait",
2495
2494
  "cbindgen",
@@ -2502,7 +2501,7 @@ dependencies = [
2502
2501
 
2503
2502
  [[package]]
2504
2503
  name = "kreuzberg-rb"
2505
- version = "4.0.0-rc.10"
2504
+ version = "4.0.0-rc.14"
2506
2505
  dependencies = [
2507
2506
  "async-trait",
2508
2507
  "html-to-markdown-rs",
@@ -2517,7 +2516,7 @@ dependencies = [
2517
2516
 
2518
2517
  [[package]]
2519
2518
  name = "kreuzberg-tesseract"
2520
- version = "4.0.0-rc.10"
2519
+ version = "4.0.0-rc.13"
2521
2520
  dependencies = [
2522
2521
  "cc",
2523
2522
  "cmake",
@@ -4633,9 +4632,9 @@ dependencies = [
4633
4632
 
4634
4633
  [[package]]
4635
4634
  name = "rmcp"
4636
- version = "0.11.0"
4635
+ version = "0.12.0"
4637
4636
  source = "registry+https://github.com/rust-lang/crates.io-index"
4638
- checksum = "5df440eaa43f8573491ed4a5899719b6d29099500774abba12214a095a4083ed"
4637
+ checksum = "528d42f8176e6e5e71ea69182b17d1d0a19a6b3b894b564678b74cd7cab13cfa"
4639
4638
  dependencies = [
4640
4639
  "async-trait",
4641
4640
  "axum",
@@ -4665,11 +4664,11 @@ dependencies = [
4665
4664
 
4666
4665
  [[package]]
4667
4666
  name = "rmcp-macros"
4668
- version = "0.11.0"
4667
+ version = "0.12.0"
4669
4668
  source = "registry+https://github.com/rust-lang/crates.io-index"
4670
- checksum = "9ef03779cccab8337dd8617c53fce5c98ec21794febc397531555472ca28f8c3"
4669
+ checksum = "e3f81daaa494eb8e985c9462f7d6ce1ab05e5299f48aafd76cdd3d8b060e6f59"
4671
4670
  dependencies = [
4672
- "darling 0.21.3",
4671
+ "darling 0.23.0",
4673
4672
  "proc-macro2",
4674
4673
  "quote",
4675
4674
  "serde_json",
@@ -4999,9 +4998,9 @@ dependencies = [
4999
4998
 
5000
4999
  [[package]]
5001
5000
  name = "serde_spanned"
5002
- version = "1.0.3"
5001
+ version = "1.0.4"
5003
5002
  source = "registry+https://github.com/rust-lang/crates.io-index"
5004
- checksum = "e24345aa0fe688594e73770a5f6d1b216508b4f93484c0026d521acd30134392"
5003
+ checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776"
5005
5004
  dependencies = [
5006
5005
  "serde_core",
5007
5006
  ]
@@ -5678,14 +5677,14 @@ dependencies = [
5678
5677
 
5679
5678
  [[package]]
5680
5679
  name = "toml"
5681
- version = "0.9.8"
5680
+ version = "0.9.10+spec-1.1.0"
5682
5681
  source = "registry+https://github.com/rust-lang/crates.io-index"
5683
- checksum = "f0dc8b1fb61449e27716ec0e1bdf0f6b8f3e8f6b05391e8497b8b6d7804ea6d8"
5682
+ checksum = "0825052159284a1a8b4d6c0c86cbc801f2da5afd2b225fa548c72f2e74002f48"
5684
5683
  dependencies = [
5685
5684
  "indexmap",
5686
5685
  "serde_core",
5687
- "serde_spanned 1.0.3",
5688
- "toml_datetime 0.7.3",
5686
+ "serde_spanned 1.0.4",
5687
+ "toml_datetime 0.7.5+spec-1.1.0",
5689
5688
  "toml_parser",
5690
5689
  "toml_writer",
5691
5690
  "winnow",
@@ -5702,9 +5701,9 @@ dependencies = [
5702
5701
 
5703
5702
  [[package]]
5704
5703
  name = "toml_datetime"
5705
- version = "0.7.3"
5704
+ version = "0.7.5+spec-1.1.0"
5706
5705
  source = "registry+https://github.com/rust-lang/crates.io-index"
5707
- checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533"
5706
+ checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347"
5708
5707
  dependencies = [
5709
5708
  "serde_core",
5710
5709
  ]
@@ -5725,9 +5724,9 @@ dependencies = [
5725
5724
 
5726
5725
  [[package]]
5727
5726
  name = "toml_parser"
5728
- version = "1.0.4"
5727
+ version = "1.0.6+spec-1.1.0"
5729
5728
  source = "registry+https://github.com/rust-lang/crates.io-index"
5730
- checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e"
5729
+ checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44"
5731
5730
  dependencies = [
5732
5731
  "winnow",
5733
5732
  ]
@@ -5740,9 +5739,9 @@ checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
5740
5739
 
5741
5740
  [[package]]
5742
5741
  name = "toml_writer"
5743
- version = "1.0.4"
5742
+ version = "1.0.6+spec-1.1.0"
5744
5743
  source = "registry+https://github.com/rust-lang/crates.io-index"
5745
- checksum = "df8b2b54733674ad286d16267dcfc7a71ed5c776e4ac7aa3c3e2561f7c637bf2"
5744
+ checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607"
5746
5745
 
5747
5746
  [[package]]
5748
5747
  name = "tower"
@@ -7,7 +7,7 @@ rb-sys = { path = "../../../vendor/rb-sys" }
7
7
 
8
8
  [package]
9
9
  name = "kreuzberg-rb"
10
- version = "4.0.0-rc.11"
10
+ version = "4.0.0-rc.14"
11
11
  edition = "2024"
12
12
  rust-version = "1.91"
13
13
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -59,7 +59,7 @@ use std::ffi::c_char;
59
59
  // These C ABI functions are provided by the kreuzberg-ffi crate
60
60
  // We declare them here to ensure proper linking on all platforms
61
61
  #[link(name = "kreuzberg_ffi", kind = "static")]
62
- extern "C" {
62
+ unsafe extern "C" {
63
63
  pub fn kreuzberg_last_error_code() -> i32;
64
64
  pub fn kreuzberg_last_panic_context() -> *mut c_char;
65
65
  pub fn kreuzberg_free_string(s: *mut c_char);
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.0.0-rc.11'
4
+ VERSION = '4.0.0-rc.14'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.0.0-rc.11"
5
+ version = "4.0.0-rc.14"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -33,6 +33,7 @@ tracing = "0.1"
33
33
  ahash = "0.8.12"
34
34
  base64 = "0.22.1"
35
35
  hex = "0.4.3"
36
+ toml = "0.9.10"
36
37
  num_cpus = "1.17.0"
37
38
  once_cell = "1.21.3"
38
39
  html-to-markdown-rs = { version = "2.14.11", default-features = false }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.0.0-rc.11"
3
+ version = "4.0.0-rc.14"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -52,7 +52,7 @@ office = [
52
52
  email = ["dep:mail-parser", "dep:msg_parser"]
53
53
  html = ["dep:html-to-markdown-rs"]
54
54
  xml = ["dep:quick-xml", "dep:roxmltree"]
55
- archives = ["dep:zip", "dep:tar", "dep:sevenz-rust"]
55
+ archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2"]
56
56
 
57
57
  # Processing features
58
58
  ocr = [
@@ -145,7 +145,7 @@ regex = "1.12.2"
145
145
  serde = { version = "1.0.228", features = ["derive"] }
146
146
  serde_json = "1.0.145"
147
147
  serde_yaml_ng = "0.10.0"
148
- toml = { workspace = true }
148
+ toml = "0.9.10"
149
149
  mime_guess = "2.0"
150
150
  rmp-serde = "1.3"
151
151
  thiserror = "2.0.17"
@@ -163,13 +163,13 @@ lopdf = { version = "0.38.0", optional = true }
163
163
  calamine = { version = "0.32.0", features = ["dates"], optional = true }
164
164
  polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
165
165
  roxmltree = { version = "0.21.1", optional = true }
166
- zip = { version = "6.0.0", optional = true }
166
+ zip = { version = "7.0.0", optional = true }
167
167
  mail-parser = { version = "0.11.1", optional = true }
168
168
  msg_parser = { version = "0.1.1", optional = true }
169
169
  html-to-markdown-rs = { version = "2.14.11", default-features = false, features = ["inline-images"], optional = true }
170
170
  quick-xml = { version = "0.38.4", features = ["serialize"], optional = true }
171
171
  tar = { version = "0.4.44", optional = true }
172
- sevenz-rust = { version = "0.6.1", optional = true }
172
+ sevenz-rust2 = { version = "0.20.0", optional = true }
173
173
  docx-lite = { version = "0.2.0", optional = true }
174
174
 
175
175
  pulldown-cmark = { version = "0.13", optional = true }
@@ -228,7 +228,7 @@ infer = "0.19.0"
228
228
  tempfile = "3.23.0"
229
229
  filetime = "0.2"
230
230
  tar = "0.4.44"
231
- zip = "6.0.0"
231
+ zip = "7.0.0"
232
232
  serial_test = "3.2.0"
233
233
  anyhow = "1.0"
234
234
  tokio-test = "0.4"
@@ -3,7 +3,7 @@
3
3
  //! This module provides functions for extracting file lists and contents from archives.
4
4
 
5
5
  use crate::error::{KreuzbergError, Result};
6
- use sevenz_rust::SevenZReader;
6
+ use sevenz_rust2::{ArchiveReader, Password};
7
7
  use std::collections::HashMap;
8
8
  use std::io::{Cursor, Read};
9
9
  use tar::Archive as TarArchive;
@@ -179,7 +179,7 @@ pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
179
179
  /// Extract metadata from a 7z archive.
180
180
  pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
181
181
  let cursor = Cursor::new(bytes);
182
- let archive = SevenZReader::new(cursor, bytes.len() as u64, "".into())
182
+ let archive = ArchiveReader::new(cursor, Password::empty())
183
183
  .map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
184
184
 
185
185
  let mut file_list = Vec::new();
@@ -212,7 +212,7 @@ pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
212
212
  /// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log
213
213
  pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
214
214
  let cursor = Cursor::new(bytes);
215
- let mut archive = SevenZReader::new(cursor, bytes.len() as u64, "".into())
215
+ let mut archive = ArchiveReader::new(cursor, Password::empty())
216
216
  .map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
217
217
 
218
218
  let mut contents = HashMap::new();
@@ -459,26 +459,26 @@ mod tests {
459
459
 
460
460
  #[test]
461
461
  fn test_extract_7z_metadata_with_files() {
462
- use sevenz_rust::SevenZWriter;
462
+ use sevenz_rust2::{ArchiveEntry, ArchiveWriter};
463
463
 
464
- let mut cursor = Cursor::new(Vec::new());
465
- {
466
- let mut sz = SevenZWriter::new(&mut cursor).unwrap();
464
+ let cursor = {
465
+ let cursor = Cursor::new(Vec::new());
466
+ let mut sz = ArchiveWriter::new(cursor).unwrap();
467
467
 
468
468
  sz.push_archive_entry(
469
- sevenz_rust::SevenZArchiveEntry::from_path("test.txt", "test.txt".to_string()),
469
+ ArchiveEntry::new_file("test.txt"),
470
470
  Some(Cursor::new(b"Hello 7z!".to_vec())),
471
471
  )
472
472
  .unwrap();
473
473
 
474
474
  sz.push_archive_entry(
475
- sevenz_rust::SevenZArchiveEntry::from_path("data.json", "data.json".to_string()),
475
+ ArchiveEntry::new_file("data.json"),
476
476
  Some(Cursor::new(b"{\"key\":\"value\"}".to_vec())),
477
477
  )
478
478
  .unwrap();
479
479
 
480
- sz.finish().unwrap();
481
- }
480
+ sz.finish().unwrap()
481
+ };
482
482
 
483
483
  let bytes = cursor.into_inner();
484
484
  let metadata = extract_7z_metadata(&bytes).unwrap();
@@ -834,26 +834,26 @@ mod tests {
834
834
 
835
835
  #[test]
836
836
  fn test_extract_7z_text_content() {
837
- use sevenz_rust::SevenZWriter;
837
+ use sevenz_rust2::{ArchiveEntry, ArchiveWriter};
838
838
 
839
- let mut cursor = Cursor::new(Vec::new());
840
- {
841
- let mut sz = SevenZWriter::new(&mut cursor).unwrap();
839
+ let cursor = {
840
+ let cursor = Cursor::new(Vec::new());
841
+ let mut sz = ArchiveWriter::new(cursor).unwrap();
842
842
 
843
843
  sz.push_archive_entry(
844
- sevenz_rust::SevenZArchiveEntry::from_path("test.txt", "test.txt".to_string()),
844
+ ArchiveEntry::new_file("test.txt"),
845
845
  Some(Cursor::new(b"Hello 7z text!".to_vec())),
846
846
  )
847
847
  .unwrap();
848
848
 
849
849
  sz.push_archive_entry(
850
- sevenz_rust::SevenZArchiveEntry::from_path("readme.md", "readme.md".to_string()),
850
+ ArchiveEntry::new_file("readme.md"),
851
851
  Some(Cursor::new(b"# 7z README".to_vec())),
852
852
  )
853
853
  .unwrap();
854
854
 
855
- sz.finish().unwrap();
856
- }
855
+ sz.finish().unwrap()
856
+ };
857
857
 
858
858
  let bytes = cursor.into_inner();
859
859
  let contents = extract_7z_text_content(&bytes).unwrap();
@@ -865,13 +865,13 @@ mod tests {
865
865
 
866
866
  #[test]
867
867
  fn test_extract_7z_empty_archive() {
868
- use sevenz_rust::SevenZWriter;
868
+ use sevenz_rust2::ArchiveWriter;
869
869
 
870
- let mut cursor = Cursor::new(Vec::new());
871
- {
872
- let sz = SevenZWriter::new(&mut cursor).unwrap();
873
- sz.finish().unwrap();
874
- }
870
+ let cursor = {
871
+ let cursor = Cursor::new(Vec::new());
872
+ let sz = ArchiveWriter::new(cursor).unwrap();
873
+ sz.finish().unwrap()
874
+ };
875
875
 
876
876
  let bytes = cursor.into_inner();
877
877
  let metadata = extract_7z_metadata(&bytes).unwrap();
@@ -361,6 +361,7 @@ mod tests {
361
361
  use super::*;
362
362
 
363
363
  #[tokio::test]
364
+ #[cfg(not(target_os = "windows"))]
364
365
  async fn test_check_libreoffice_available() {
365
366
  let result = check_libreoffice_available().await;
366
367
  if result.is_err() {
@@ -370,6 +371,7 @@ mod tests {
370
371
  }
371
372
 
372
373
  #[tokio::test]
374
+ #[cfg(not(target_os = "windows"))]
373
375
  async fn test_convert_office_doc_missing_file() {
374
376
  if check_libreoffice_available().await.is_err() {
375
377
  return;
@@ -391,6 +393,7 @@ mod tests {
391
393
  }
392
394
 
393
395
  #[tokio::test]
396
+ #[cfg(not(target_os = "windows"))]
394
397
  async fn test_convert_doc_to_docx_empty_bytes() {
395
398
  if check_libreoffice_available().await.is_err() {
396
399
  return;
@@ -403,6 +406,7 @@ mod tests {
403
406
  }
404
407
 
405
408
  #[tokio::test]
409
+ #[cfg(not(target_os = "windows"))]
406
410
  async fn test_convert_ppt_to_pptx_empty_bytes() {
407
411
  if check_libreoffice_available().await.is_err() {
408
412
  return;
@@ -415,6 +419,7 @@ mod tests {
415
419
  }
416
420
 
417
421
  #[tokio::test]
422
+ #[cfg(not(target_os = "windows"))]
418
423
  async fn test_convert_doc_to_docx_invalid_doc() {
419
424
  if check_libreoffice_available().await.is_err() {
420
425
  return;
@@ -427,6 +432,7 @@ mod tests {
427
432
  }
428
433
 
429
434
  #[tokio::test]
435
+ #[cfg(not(target_os = "windows"))]
430
436
  async fn test_convert_ppt_to_pptx_invalid_ppt() {
431
437
  if check_libreoffice_available().await.is_err() {
432
438
  return;
@@ -439,6 +445,7 @@ mod tests {
439
445
  }
440
446
 
441
447
  #[tokio::test]
448
+ #[cfg(not(target_os = "windows"))]
442
449
  async fn test_convert_office_doc_invalid_target_format() {
443
450
  if check_libreoffice_available().await.is_err() {
444
451
  return;
@@ -459,6 +466,7 @@ mod tests {
459
466
  }
460
467
 
461
468
  #[tokio::test]
469
+ #[cfg(not(target_os = "windows"))]
462
470
  async fn test_check_libreoffice_missing_dependency_error() {
463
471
  let result = check_libreoffice_available().await;
464
472
 
@@ -473,6 +481,7 @@ mod tests {
473
481
  }
474
482
 
475
483
  #[tokio::test]
484
+ #[cfg(not(target_os = "windows"))]
476
485
  async fn test_convert_office_doc_creates_output_dir() {
477
486
  if check_libreoffice_available().await.is_err() {
478
487
  return;
@@ -507,6 +516,7 @@ mod tests {
507
516
  }
508
517
 
509
518
  #[tokio::test]
519
+ #[cfg(not(target_os = "windows"))]
510
520
  async fn test_convert_doc_to_docx_temp_cleanup() {
511
521
  if check_libreoffice_available().await.is_err() {
512
522
  return;
@@ -517,6 +527,7 @@ mod tests {
517
527
  }
518
528
 
519
529
  #[tokio::test]
530
+ #[cfg(not(target_os = "windows"))]
520
531
  async fn test_convert_ppt_to_pptx_temp_cleanup() {
521
532
  if check_libreoffice_available().await.is_err() {
522
533
  return;
@@ -527,6 +538,7 @@ mod tests {
527
538
  }
528
539
 
529
540
  #[tokio::test]
541
+ #[cfg(not(target_os = "windows"))]
530
542
  async fn test_convert_office_doc_timeout_kills_process() {
531
543
  if check_libreoffice_available().await.is_err() {
532
544
  return;
@@ -5,6 +5,7 @@ use crate::core::config::ExtractionConfig;
5
5
  use crate::plugins::{DocumentExtractor, Plugin};
6
6
  use crate::types::{ExtractionResult, Metadata, PageContent};
7
7
  use async_trait::async_trait;
8
+ #[cfg(feature = "tokio-runtime")]
8
9
  use std::path::Path;
9
10
 
10
11
  #[cfg(feature = "pdf")]
@@ -370,7 +371,7 @@ impl DocumentExtractor for PdfExtractor {
370
371
 
371
372
  (pdf_metadata, native_text, tables, page_contents)
372
373
  }
373
- #[cfg(not(target_arch = "wasm32"))]
374
+ #[cfg(all(not(target_arch = "wasm32"), feature = "tokio-runtime"))]
374
375
  {
375
376
  if crate::core::batch_mode::is_batch_mode() {
376
377
  let content_owned = content.to_vec();
@@ -441,6 +442,32 @@ impl DocumentExtractor for PdfExtractor {
441
442
  (pdf_metadata, native_text, tables, page_contents)
442
443
  }
443
444
  }
445
+ #[cfg(all(not(target_arch = "wasm32"), not(feature = "tokio-runtime")))]
446
+ {
447
+ let bindings =
448
+ crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
449
+
450
+ let pdfium = Pdfium::new(bindings);
451
+
452
+ let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
453
+ let err_msg = e.to_string();
454
+ if err_msg.contains("password") || err_msg.contains("Password") {
455
+ PdfError::PasswordRequired
456
+ } else {
457
+ PdfError::InvalidPdf(err_msg)
458
+ }
459
+ })?;
460
+
461
+ let (native_text, boundaries, page_contents) =
462
+ crate::pdf::text::extract_text_from_pdf_document(&document, config.pages.as_ref())?;
463
+
464
+ let pdf_metadata =
465
+ crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
466
+
467
+ let tables = extract_tables_from_document(&document, &pdf_metadata)?;
468
+
469
+ (pdf_metadata, native_text, tables, page_contents)
470
+ }
444
471
  };
445
472
 
446
473
  #[cfg(feature = "ocr")]
@@ -5,7 +5,7 @@ pub(crate) fn bind_pdfium(
5
5
  map_err: fn(String) -> PdfError,
6
6
  context: &'static str,
7
7
  ) -> Result<Box<dyn PdfiumLibraryBindings>, PdfError> {
8
- #[cfg(all(feature = "pdf", feature = "pdf-bundled"))]
8
+ #[cfg(all(feature = "pdf", feature = "bundled-pdfium"))]
9
9
  {
10
10
  // WASM target: use dynamic binding to WASM module
11
11
  // SAFETY: pdfium-render handles WASM module lifecycle internally.
@@ -36,7 +36,7 @@ pub(crate) fn bind_pdfium(
36
36
  }
37
37
  }
38
38
 
39
- #[cfg(all(feature = "pdf", not(feature = "pdf-bundled")))]
39
+ #[cfg(all(feature = "pdf", not(feature = "bundled-pdfium")))]
40
40
  {
41
41
  Pdfium::bind_to_system_library()
42
42
  .map_err(|e| map_err(format!("Failed to initialize Pdfium ({}): {}", context, e)))
@@ -1,6 +1,6 @@
1
1
  //! Runtime extraction of bundled PDFium library.
2
2
  //!
3
- //! When the `pdf-bundled` feature is enabled, the PDFium library is embedded in the binary
3
+ //! When the `bundled-pdfium` feature is enabled, the PDFium library is embedded in the binary
4
4
  //! using `include_bytes!` during compilation. This module handles runtime extraction to a
5
5
  //! temporary directory and provides the path for dynamic loading.
6
6
  //!
@@ -15,7 +15,7 @@
15
15
  //! # Example
16
16
  //!
17
17
  //! ```rust,ignore
18
- //! # #[cfg(feature = "pdf-bundled")]
18
+ //! # #[cfg(feature = "bundled-pdfium")]
19
19
  //! # {
20
20
  //! use kreuzberg::pdf::bundled::extract_bundled_pdfium;
21
21
  //!
@@ -280,7 +280,7 @@ mod tests {
280
280
  }
281
281
 
282
282
  #[test]
283
- #[cfg(feature = "pdf-bundled")]
283
+ #[cfg(feature = "bundled-pdfium")]
284
284
  fn test_extract_bundled_pdfium() {
285
285
  let result = extract_bundled_pdfium();
286
286
  assert!(result.is_ok());
@@ -299,7 +299,7 @@ mod tests {
299
299
  }
300
300
 
301
301
  #[test]
302
- #[cfg(feature = "pdf-bundled")]
302
+ #[cfg(feature = "bundled-pdfium")]
303
303
  fn test_extract_bundled_pdfium_reuses_existing() {
304
304
  // First extraction
305
305
  let result1 = extract_bundled_pdfium();
@@ -326,7 +326,7 @@ mod tests {
326
326
 
327
327
  #[test]
328
328
  #[cfg(unix)]
329
- #[cfg(feature = "pdf-bundled")]
329
+ #[cfg(feature = "bundled-pdfium")]
330
330
  fn test_extract_bundled_pdfium_permissions() {
331
331
  let result = extract_bundled_pdfium();
332
332
  assert!(result.is_ok());
@@ -37,7 +37,7 @@
37
37
  //! functionality in the PDF extractor for rendering pages to images.
38
38
  #[cfg(feature = "pdf")]
39
39
  pub(crate) mod bindings;
40
- #[cfg(all(feature = "pdf", feature = "pdf-bundled"))]
40
+ #[cfg(all(feature = "pdf", feature = "bundled-pdfium"))]
41
41
  pub mod bundled;
42
42
  #[cfg(feature = "pdf")]
43
43
  pub mod error;
@@ -52,7 +52,7 @@ pub mod table;
52
52
  #[cfg(feature = "pdf")]
53
53
  pub mod text;
54
54
 
55
- #[cfg(all(feature = "pdf", feature = "pdf-bundled"))]
55
+ #[cfg(all(feature = "pdf", feature = "bundled-pdfium"))]
56
56
  pub use bundled::extract_bundled_pdfium;
57
57
  #[cfg(feature = "pdf")]
58
58
  pub use error::PdfError;
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-ffi"
3
- version = "4.0.0-rc.11"
3
+ version = "4.0.0-rc.14"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -46,7 +46,7 @@ serde_json = "1.0.145"
46
46
  serde = { version = "1.0.228", features = ["derive"] }
47
47
  async-trait = "0.1.89"
48
48
  tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
49
- html-to-markdown-rs = { version = "2.14.11", default-features = false }
49
+ html-to-markdown-rs = { version = "2.15.0", default-features = false }
50
50
 
51
51
  [target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
52
52
  kreuzberg = { path = "../kreuzberg", features = ["full", "bundled-pdfium"] }
@@ -54,7 +54,7 @@ serde_json = "1.0.145"
54
54
  serde = { version = "1.0.228", features = ["derive"] }
55
55
  async-trait = "0.1.89"
56
56
  tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
57
- html-to-markdown-rs = { version = "2.14.11", default-features = false }
57
+ html-to-markdown-rs = { version = "2.15.0", default-features = false }
58
58
 
59
59
  [build-dependencies]
60
60
  cbindgen = "0.29"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.0.0-rc.11"
3
+ version = "4.0.0-rc.14"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -24,13 +24,13 @@ thiserror = "2.0.17"
24
24
  image = { workspace = true }
25
25
 
26
26
  [build-dependencies]
27
- cc = { version = "1.2.49", optional = true }
27
+ cc = { version = "1.2.50", optional = true }
28
28
  cmake = { version = "0.1.57", optional = true }
29
29
  reqwest = { version = "0.12.25", default-features = false, features = [
30
30
  "blocking",
31
31
  "rustls-tls",
32
32
  ], optional = true }
33
- zip = { version = "6.0.0", optional = true }
33
+ zip = { version = "7.0.0", optional = true }
34
34
 
35
35
  [features]
36
36
  default = ["static-linking"]
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.0.0.pre.rc.11
4
+ version: 4.0.0.pre.rc.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2025-12-19 00:00:00.000000000 Z
10
+ date: 2025-12-20 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: bundler
@@ -237,7 +236,6 @@ files:
237
236
  - vendor/kreuzberg-ffi/README.md
238
237
  - vendor/kreuzberg-ffi/build.rs
239
238
  - vendor/kreuzberg-ffi/cbindgen.toml
240
- - vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc
241
239
  - vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in
242
240
  - vendor/kreuzberg-ffi/kreuzberg.h
243
241
  - vendor/kreuzberg-ffi/src/lib.rs
@@ -546,14 +544,13 @@ homepage: https://github.com/kreuzberg-dev/kreuzberg
546
544
  licenses:
547
545
  - MIT
548
546
  metadata:
549
- homepage_uri: https://github.com/kreuzberg-dev/kreuzberg
550
- source_code_uri: https://github.com/kreuzberg-dev/kreuzberg
547
+ bug_tracker_uri: https://github.com/kreuzberg-dev/kreuzberg/issues
551
548
  changelog_uri: https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md
552
549
  documentation_uri: https://docs.kreuzberg.dev
553
- bug_tracker_uri: https://github.com/kreuzberg-dev/kreuzberg/issues
554
- rubygems_mfa_required: 'true'
550
+ homepage_uri: https://github.com/kreuzberg-dev/kreuzberg
555
551
  keywords: document-intelligence,document-extraction,ocr,rust,bindings
556
- post_install_message:
552
+ rubygems_mfa_required: 'true'
553
+ source_code_uri: https://github.com/kreuzberg-dev/kreuzberg
557
554
  rdoc_options: []
558
555
  require_paths:
559
556
  - lib
@@ -568,8 +565,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
568
565
  - !ruby/object:Gem::Version
569
566
  version: '0'
570
567
  requirements: []
571
- rubygems_version: 3.5.22
572
- signing_key:
568
+ rubygems_version: 4.0.2
573
569
  specification_version: 4
574
570
  summary: High-performance document intelligence framework
575
571
  test_files: []
@@ -1,12 +0,0 @@
1
- prefix=/usr/local
2
- exec_prefix=${prefix}
3
- libdir=${exec_prefix}/lib
4
- includedir=${prefix}/include
5
-
6
- Name: kreuzberg-ffi
7
- Description: C FFI bindings for Kreuzberg document intelligence library
8
- Version: 4.0.0-rc.11
9
- URL: https://kreuzberg.dev
10
- Libs: -L${libdir} -lkreuzberg_ffi
11
- Libs.private: -framework CoreFoundation -framework Security -lpthread
12
- Cflags: -I${includedir}