kreuzberg 4.3.2 → 4.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +3 -3
  7. data/vendor/kreuzberg/Cargo.toml +3 -3
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/examples/download_paddle_models.rs +47 -29
  10. data/vendor/kreuzberg/src/extraction/docx/drawing.rs +571 -0
  11. data/vendor/kreuzberg/src/extraction/docx/mod.rs +20 -0
  12. data/vendor/kreuzberg/src/extraction/docx/parser.rs +1210 -283
  13. data/vendor/kreuzberg/src/extraction/docx/section.rs +497 -0
  14. data/vendor/kreuzberg/src/extraction/docx/styles.rs +1534 -0
  15. data/vendor/kreuzberg/src/extraction/docx/table.rs +1038 -0
  16. data/vendor/kreuzberg/src/extraction/docx/theme.rs +490 -0
  17. data/vendor/kreuzberg/src/extraction/image_format.rs +98 -0
  18. data/vendor/kreuzberg/src/extraction/image_ocr.rs +75 -0
  19. data/vendor/kreuzberg/src/extraction/mod.rs +9 -0
  20. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +3 -3
  21. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +1 -1
  22. data/vendor/kreuzberg/src/extraction/ooxml_constants.rs +45 -0
  23. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +1 -15
  24. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +3 -3
  25. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +46 -50
  26. data/vendor/kreuzberg/src/extraction/structured.rs +51 -6
  27. data/vendor/kreuzberg/src/extraction/xml.rs +44 -12
  28. data/vendor/kreuzberg/src/extractors/docx.rs +1373 -97
  29. data/vendor/kreuzberg/src/extractors/excel.rs +2 -2
  30. data/vendor/kreuzberg/src/extractors/jupyter.rs +135 -34
  31. data/vendor/kreuzberg/src/extractors/latex/commands.rs +101 -4
  32. data/vendor/kreuzberg/src/extractors/latex/parser.rs +70 -10
  33. data/vendor/kreuzberg/src/extractors/markdown.rs +261 -12
  34. data/vendor/kreuzberg/src/extractors/pptx.rs +4 -66
  35. data/vendor/kreuzberg/src/paddle_ocr/backend.rs +137 -214
  36. data/vendor/kreuzberg/src/paddle_ocr/config.rs +55 -0
  37. data/vendor/kreuzberg/src/paddle_ocr/mod.rs +61 -9
  38. data/vendor/kreuzberg/src/paddle_ocr/model_manager.rs +465 -384
  39. data/vendor/kreuzberg/src/types/metadata.rs +40 -0
  40. data/vendor/kreuzberg/tests/docx_formatting_test.rs +9 -9
  41. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +7 -7
  42. data/vendor/kreuzberg/tests/paddle_ocr_multilang.rs +640 -0
  43. data/vendor/kreuzberg/tests/pdf_text_merging.rs +8 -0
  44. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +33 -8
  45. data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
  46. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  47. data/vendor/kreuzberg-paddle-ocr/src/crnn_net.rs +1 -1
  48. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  49. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  50. metadata +11 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a66ef8cb3e024db9b96d346dbc21f2b255cafc56baaede065078cbc225cf571d
4
- data.tar.gz: 5ad2fec77bf30f7c91ef8c1618ad1ecdad6714cbfc78046c77a8d5631647e29d
3
+ metadata.gz: 801d80170e744ae0641ce1fd08c3f09d2bed1a88e41a1ffc533a7d8d8eafe391
4
+ data.tar.gz: 58437edacbe5d45d45b95695fd75992559ab2f3120d4b69ddfd7153e0adfdd18
5
5
  SHA512:
6
- metadata.gz: f86b3fa11ca2d638fe3102f5f64757914664f169ffc8d59008d79d7b32d19b06fe65ec609ae22f43bcf4d000260c8a427d15e08cc5135e2d47db14b4ad060ccf
7
- data.tar.gz: fc1b33c27e0159999a1e19baf91156325a6959dfde9c9a99e07dc616749ceec7f04ee54c6cbb29b4fd9acd83717012465b17393518f3cda8fa6461c1f66fe49b
6
+ metadata.gz: 6e7a486189ce2ce1878ef898a9d7784a95666a673177e66e1921ed67f33d82cfb735356890e9dff449c404a45480f94c34794c6c97988a57f59e28d799d43a8b
7
+ data.tar.gz: 7951e6aaa0c882dd65c702bff65cce5980401a2a31423707b38a9f63d4c8983d919fcfe50b8843ac1e908e9e754d6c2f7e2e52d7b64a66a762fbeeceedb41def
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.3.2)
4
+ kreuzberg (4.3.3)
5
5
  rb_sys (~> 0.9.119)
6
6
 
7
7
  GEM
@@ -123,7 +123,7 @@ GEM
123
123
  rubocop (~> 1.81)
124
124
  ruby-progressbar (1.13.0)
125
125
  securerandom (0.4.1)
126
- sorbet-runtime (0.6.12938)
126
+ sorbet-runtime (0.6.12940)
127
127
  steep (1.10.0)
128
128
  activesupport (>= 5.1)
129
129
  concurrent-ruby (>= 1.1.10)
@@ -209,7 +209,7 @@ CHECKSUMS
209
209
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
210
210
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
211
211
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
212
- kreuzberg (4.3.2)
212
+ kreuzberg (4.3.3)
213
213
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
214
214
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
215
215
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -244,7 +244,7 @@ CHECKSUMS
244
244
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
245
245
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
246
246
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
247
- sorbet-runtime (0.6.12938) sha256=698a95c71c94beb1d844b78d14d38765bdd423f2a58d05853809fe86a3c56e0d
247
+ sorbet-runtime (0.6.12940) sha256=e935193a35cc5fc92d0bbe60db50fbdfbb787052a3cf01b2daa7af4cecb318cd
248
248
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
249
249
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
250
250
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.2" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.3" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -37,7 +37,7 @@ collapsible_if = "allow"
37
37
 
38
38
  [package]
39
39
  name = "kreuzberg-rb"
40
- version = "4.3.2"
40
+ version = "4.3.3"
41
41
  edition = "2024"
42
42
  rust-version = "1.91"
43
43
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.3.2'
4
+ VERSION = '4.3.3'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.3.2"
5
+ version = "4.3.3"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -22,7 +22,7 @@ getrandom = { version = "0.4.1", features = ["wasm_js"] }
22
22
  hex = "0.4.3"
23
23
  html-to-markdown-rs = { version = "2.24.5", default-features = false }
24
24
  image = { version = "0.25.9", default-features = false }
25
- libc = "0.2.181"
25
+ libc = "0.2.182"
26
26
  lzma-rust2 = { version = "0.15.7" }
27
27
  num_cpus = "1.17.0"
28
28
  once_cell = "1.21.3"
@@ -34,5 +34,5 @@ serde_json = { version = "1.0.149" }
34
34
  tempfile = "3.25.0"
35
35
  thiserror = "2.0.18"
36
36
  tokio = { version = "1.49.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
37
- toml = "1.0.0"
37
+ toml = "1.0.1"
38
38
  tracing = "0.1"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.3.2"
3
+ version = "4.3.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -142,7 +142,7 @@ dirs = "6.0"
142
142
  simdutf8 = { version = "0.1", optional = true }
143
143
  hex = "0.4.3"
144
144
  lazy_static = "1.5.0"
145
- libc = "0.2.181"
145
+ libc = "0.2.182"
146
146
  memchr = "2.8.0"
147
147
  num_cpus = "1.17.0"
148
148
  once_cell = "1.21.3"
@@ -154,7 +154,7 @@ serde = { version = "1.0.228", features = ["derive"] }
154
154
  serde_json = { version = "1.0.149" }
155
155
  serde_yaml_ng = "0.10.0"
156
156
  jotdown = "0.9"
157
- toml = "1.0.0"
157
+ toml = "1.0.1"
158
158
  mime_guess = "2.0"
159
159
  rmp-serde = "1.3"
160
160
  thiserror = "2.0.18"
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.3.2 Release**
20
+ > **🚀 Version 4.3.3 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -4,17 +4,6 @@
4
4
  //! and cache ONNX models locally. This is useful for offline applications or
5
5
  //! pre-warming the model cache before starting document extraction.
6
6
  //!
7
- //! # Security Notice
8
- //!
9
- //! **IMPORTANT**: The PaddleOCR models are currently downloaded without SHA256
10
- //! checksum verification. The model definitions in `paddle_ocr/model_manager.rs`
11
- //! contain empty checksum strings (lines 59, 66, 73) with a note stating:
12
- //! "Skip checksum for now - will be updated with actual checksums".
13
- //!
14
- //! This is a security concern for production use. Models should be verified
15
- //! against their known cryptographic signatures before use. See the model manager
16
- //! module for implementation details and to track when checksums are added.
17
- //!
18
7
  //! # Usage
19
8
  //!
20
9
  //! ```sh
@@ -36,14 +25,23 @@
36
25
  //!
37
26
  //! # Language Support
38
27
  //!
39
- //! The current implementation downloads fixed model sets optimized for:
40
- //! - Detection (PP-OCRv4 English)
41
- //! - Classification (MobileNet v2.0 Chinese/Universal)
42
- //! - Recognition (PP-OCRv4 English)
28
+ //! This implementation supports 12 script families covering 106+ languages:
29
+ //! - **English**: English-optimized recognition models
30
+ //! - **Chinese**: Simplified and Traditional Chinese
31
+ //! - **Latin**: European languages using Latin script
32
+ //! - **Korean**: Hangul script
33
+ //! - **Eslav**: Cyrillic-based languages (Russian, Ukrainian, etc.)
34
+ //! - **Thai**: Thai script
35
+ //! - **Greek**: Greek script
36
+ //! - **Arabic**: Arabic and Persian scripts
37
+ //! - **Devanagari**: Hindi and related scripts
38
+ //! - **Tamil**: Tamil script
39
+ //! - **Telugu**: Telugu script
40
+ //! - **Kannada**: Kannada script
43
41
  //!
44
- //! Language-specific model selection is not yet implemented in the ModelManager.
45
- //! To use models for other languages, you would need to manually download from
46
- //! the PaddleOCR model repository and configure custom model paths.
42
+ //! Models are downloaded on-demand per script family. The English recognition model
43
+ //! and dictionary are downloaded by default. Other language families are automatically
44
+ //! downloaded when needed during document processing.
47
45
  //!
48
46
  //! # Examples
49
47
  //!
@@ -63,6 +61,7 @@
63
61
  //! println!("Detection model: {:?}", models.det_model);
64
62
  //! println!("Classification model: {:?}", models.cls_model);
65
63
  //! println!("Recognition model: {:?}", models.rec_model);
64
+ //! println!("Dictionary file: {:?}", models.dict_file);
66
65
  //!
67
66
  //! // Show cache statistics
68
67
  //! let stats = manager.cache_stats()?;
@@ -173,13 +172,14 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
173
172
  println!("Models being downloaded:");
174
173
  println!(" - Detection model (PP-OCRv4 det)");
175
174
  println!(" - Classification model (Mobile v2.0 cls)");
176
- println!(" - Recognition model (PP-OCRv4 rec)");
177
- println!("\nWARNING: Models are downloaded without checksum verification.");
178
- println!("For production use, verify model integrity independently.\n");
179
-
180
- // SECURITY: Download and ensure models exist
181
- // NOTE: SHA256 checksums are currently empty in model_manager.rs
182
- // This should be updated with actual checksums before production deployment
175
+ println!(" - Recognition model (PP-OCRv4 rec, English)");
176
+ println!(" - Dictionary file (for text recognition)\n");
177
+ println!("Additional language family models are downloaded on-demand.");
178
+ println!("Supported families: English, Chinese, Latin, Korean, Eslav, Thai,");
179
+ println!("Greek, Arabic, Devanagari, Tamil, Telugu, Kannada.\n");
180
+
181
+ // Download and ensure models exist
182
+ // SHA256 checksums are now embedded and verified automatically
183
183
  match manager.ensure_models_exist() {
184
184
  Ok(paths) => {
185
185
  println!("\nModels downloaded successfully!\n");
@@ -187,6 +187,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
187
187
  println!(" Detection: {}", paths.det_model.display());
188
188
  println!(" Classification: {}", paths.cls_model.display());
189
189
  println!(" Recognition: {}", paths.rec_model.display());
190
+ println!(" Dictionary: {}", paths.dict_file.display());
190
191
  }
191
192
  Err(e) => {
192
193
  eprintln!("Error downloading models: {}", e);
@@ -285,9 +286,10 @@ fn print_usage(program_name: &str) {
285
286
  println!(" --help, -h Print this help message");
286
287
  println!();
287
288
  println!("NOTES:");
288
- println!(" Language-specific model selection is not yet supported.");
289
- println!(" Models downloaded are optimized for English/Chinese OCR.");
290
- println!(" See example documentation for security considerations.");
289
+ println!(" Language-specific models are supported and downloaded on-demand.");
290
+ println!(" Supported script families: English, Chinese, Latin, Korean, Eslav,");
291
+ println!(" Thai, Greek, Arabic, Devanagari, Tamil, Telugu, Kannada.");
292
+ println!(" See example documentation for language support details.");
291
293
  println!();
292
294
  println!("EXAMPLES:");
293
295
  println!(" {} --cache-dir /tmp/models", program_name);
@@ -310,7 +312,7 @@ fn list_cache_contents(cache_dir: &PathBuf) -> Result<(), Box<dyn std::error::Er
310
312
  if path.is_dir() {
311
313
  println!(" [DIR] {}/", file_name.to_string_lossy());
312
314
 
313
- // List files in subdirectory
315
+ // List files in subdirectory (2 levels deep for most, 3 for rec/)
314
316
  for sub_entry in fs::read_dir(&path)? {
315
317
  let sub_entry = sub_entry?;
316
318
  let sub_path = sub_entry.path();
@@ -321,7 +323,23 @@ fn list_cache_contents(cache_dir: &PathBuf) -> Result<(), Box<dyn std::error::Er
321
323
  let size_kb = metadata.len() as f64 / 1000.0;
322
324
  println!(" - {} ({:.1} KB)", sub_name.to_string_lossy(), size_kb);
323
325
  } else if sub_path.is_dir() {
326
+ // For rec/ directory, we have 3 levels: rec/{family}/{files}
324
327
  println!(" [DIR] {}/", sub_name.to_string_lossy());
328
+
329
+ // List files in the third level
330
+ for third_entry in fs::read_dir(&sub_path)? {
331
+ let third_entry = third_entry?;
332
+ let third_path = third_entry.path();
333
+ let third_name = third_entry.file_name();
334
+
335
+ if third_path.is_file() {
336
+ let metadata = fs::metadata(&third_path)?;
337
+ let size_kb = metadata.len() as f64 / 1000.0;
338
+ println!(" - {} ({:.1} KB)", third_name.to_string_lossy(), size_kb);
339
+ } else if third_path.is_dir() {
340
+ println!(" [DIR] {}/", third_name.to_string_lossy());
341
+ }
342
+ }
325
343
  }
326
344
  }
327
345
  } else if path.is_file() {