kreuzberg 4.3.2 → 4.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +3 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/examples/download_paddle_models.rs +47 -29
- data/vendor/kreuzberg/src/extraction/docx/drawing.rs +571 -0
- data/vendor/kreuzberg/src/extraction/docx/mod.rs +20 -0
- data/vendor/kreuzberg/src/extraction/docx/parser.rs +1210 -283
- data/vendor/kreuzberg/src/extraction/docx/section.rs +497 -0
- data/vendor/kreuzberg/src/extraction/docx/styles.rs +1534 -0
- data/vendor/kreuzberg/src/extraction/docx/table.rs +1038 -0
- data/vendor/kreuzberg/src/extraction/docx/theme.rs +490 -0
- data/vendor/kreuzberg/src/extraction/image_format.rs +98 -0
- data/vendor/kreuzberg/src/extraction/image_ocr.rs +75 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +9 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +3 -3
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +1 -1
- data/vendor/kreuzberg/src/extraction/ooxml_constants.rs +45 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +1 -15
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +3 -3
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +46 -50
- data/vendor/kreuzberg/src/extraction/structured.rs +51 -6
- data/vendor/kreuzberg/src/extraction/xml.rs +44 -12
- data/vendor/kreuzberg/src/extractors/docx.rs +1373 -97
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -2
- data/vendor/kreuzberg/src/extractors/jupyter.rs +135 -34
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +101 -4
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +70 -10
- data/vendor/kreuzberg/src/extractors/markdown.rs +261 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +4 -66
- data/vendor/kreuzberg/src/paddle_ocr/backend.rs +137 -214
- data/vendor/kreuzberg/src/paddle_ocr/config.rs +55 -0
- data/vendor/kreuzberg/src/paddle_ocr/mod.rs +61 -9
- data/vendor/kreuzberg/src/paddle_ocr/model_manager.rs +465 -384
- data/vendor/kreuzberg/src/types/metadata.rs +40 -0
- data/vendor/kreuzberg/tests/docx_formatting_test.rs +9 -9
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +7 -7
- data/vendor/kreuzberg/tests/paddle_ocr_multilang.rs +640 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +8 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +33 -8
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-paddle-ocr/src/crnn_net.rs +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- metadata +11 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 801d80170e744ae0641ce1fd08c3f09d2bed1a88e41a1ffc533a7d8d8eafe391
|
|
4
|
+
data.tar.gz: 58437edacbe5d45d45b95695fd75992559ab2f3120d4b69ddfd7153e0adfdd18
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6e7a486189ce2ce1878ef898a9d7784a95666a673177e66e1921ed67f33d82cfb735356890e9dff449c404a45480f94c34794c6c97988a57f59e28d799d43a8b
|
|
7
|
+
data.tar.gz: 7951e6aaa0c882dd65c702bff65cce5980401a2a31423707b38a9f63d4c8983d919fcfe50b8843ac1e908e9e754d6c2f7e2e52d7b64a66a762fbeeceedb41def
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.3.2)
|
|
4
|
+
kreuzberg (4.3.3)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -123,7 +123,7 @@ GEM
|
|
|
123
123
|
rubocop (~> 1.81)
|
|
124
124
|
ruby-progressbar (1.13.0)
|
|
125
125
|
securerandom (0.4.1)
|
|
126
|
-
sorbet-runtime (0.6.
|
|
126
|
+
sorbet-runtime (0.6.12940)
|
|
127
127
|
steep (1.10.0)
|
|
128
128
|
activesupport (>= 5.1)
|
|
129
129
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -209,7 +209,7 @@ CHECKSUMS
|
|
|
209
209
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
210
210
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
211
211
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
212
|
-
kreuzberg (4.3.2)
|
|
212
|
+
kreuzberg (4.3.3)
|
|
213
213
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
214
214
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
215
215
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -244,7 +244,7 @@ CHECKSUMS
|
|
|
244
244
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
245
245
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
246
246
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
247
|
-
sorbet-runtime (0.6.
|
|
247
|
+
sorbet-runtime (0.6.12940) sha256=e935193a35cc5fc92d0bbe60db50fbdfbb787052a3cf01b2daa7af4cecb318cd
|
|
248
248
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
249
249
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
250
250
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.2" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.3" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.3.2"
|
|
5
|
+
version = "4.3.3"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -22,7 +22,7 @@ getrandom = { version = "0.4.1", features = ["wasm_js"] }
|
|
|
22
22
|
hex = "0.4.3"
|
|
23
23
|
html-to-markdown-rs = { version = "2.24.5", default-features = false }
|
|
24
24
|
image = { version = "0.25.9", default-features = false }
|
|
25
|
-
libc = "0.2.
|
|
25
|
+
libc = "0.2.182"
|
|
26
26
|
lzma-rust2 = { version = "0.15.7" }
|
|
27
27
|
num_cpus = "1.17.0"
|
|
28
28
|
once_cell = "1.21.3"
|
|
@@ -34,5 +34,5 @@ serde_json = { version = "1.0.149" }
|
|
|
34
34
|
tempfile = "3.25.0"
|
|
35
35
|
thiserror = "2.0.18"
|
|
36
36
|
tokio = { version = "1.49.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
37
|
-
toml = "1.0.
|
|
37
|
+
toml = "1.0.1"
|
|
38
38
|
tracing = "0.1"
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.3.2"
|
|
3
|
+
version = "4.3.3"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -142,7 +142,7 @@ dirs = "6.0"
|
|
|
142
142
|
simdutf8 = { version = "0.1", optional = true }
|
|
143
143
|
hex = "0.4.3"
|
|
144
144
|
lazy_static = "1.5.0"
|
|
145
|
-
libc = "0.2.
|
|
145
|
+
libc = "0.2.182"
|
|
146
146
|
memchr = "2.8.0"
|
|
147
147
|
num_cpus = "1.17.0"
|
|
148
148
|
once_cell = "1.21.3"
|
|
@@ -154,7 +154,7 @@ serde = { version = "1.0.228", features = ["derive"] }
|
|
|
154
154
|
serde_json = { version = "1.0.149" }
|
|
155
155
|
serde_yaml_ng = "0.10.0"
|
|
156
156
|
jotdown = "0.9"
|
|
157
|
-
toml = "1.0.
|
|
157
|
+
toml = "1.0.1"
|
|
158
158
|
mime_guess = "2.0"
|
|
159
159
|
rmp-serde = "1.3"
|
|
160
160
|
thiserror = "2.0.18"
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.3.2 Release**
|
|
20
|
+
> **🚀 Version 4.3.3 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -4,17 +4,6 @@
|
|
|
4
4
|
//! and cache ONNX models locally. This is useful for offline applications or
|
|
5
5
|
//! pre-warming the model cache before starting document extraction.
|
|
6
6
|
//!
|
|
7
|
-
//! # Security Notice
|
|
8
|
-
//!
|
|
9
|
-
//! **IMPORTANT**: The PaddleOCR models are currently downloaded without SHA256
|
|
10
|
-
//! checksum verification. The model definitions in `paddle_ocr/model_manager.rs`
|
|
11
|
-
//! contain empty checksum strings (lines 59, 66, 73) with a note stating:
|
|
12
|
-
//! "Skip checksum for now - will be updated with actual checksums".
|
|
13
|
-
//!
|
|
14
|
-
//! This is a security concern for production use. Models should be verified
|
|
15
|
-
//! against their known cryptographic signatures before use. See the model manager
|
|
16
|
-
//! module for implementation details and to track when checksums are added.
|
|
17
|
-
//!
|
|
18
7
|
//! # Usage
|
|
19
8
|
//!
|
|
20
9
|
//! ```sh
|
|
@@ -36,14 +25,23 @@
|
|
|
36
25
|
//!
|
|
37
26
|
//! # Language Support
|
|
38
27
|
//!
|
|
39
|
-
//!
|
|
40
|
-
//! -
|
|
41
|
-
//! -
|
|
42
|
-
//! -
|
|
28
|
+
//! This implementation supports 12 script families covering 106+ languages:
|
|
29
|
+
//! - **English**: English-optimized recognition models
|
|
30
|
+
//! - **Chinese**: Simplified and Traditional Chinese
|
|
31
|
+
//! - **Latin**: European languages using Latin script
|
|
32
|
+
//! - **Korean**: Hangul script
|
|
33
|
+
//! - **Eslav**: Cyrillic-based languages (Russian, Ukrainian, etc.)
|
|
34
|
+
//! - **Thai**: Thai script
|
|
35
|
+
//! - **Greek**: Greek script
|
|
36
|
+
//! - **Arabic**: Arabic and Persian scripts
|
|
37
|
+
//! - **Devanagari**: Hindi and related scripts
|
|
38
|
+
//! - **Tamil**: Tamil script
|
|
39
|
+
//! - **Telugu**: Telugu script
|
|
40
|
+
//! - **Kannada**: Kannada script
|
|
43
41
|
//!
|
|
44
|
-
//!
|
|
45
|
-
//!
|
|
46
|
-
//!
|
|
42
|
+
//! Models are downloaded on-demand per script family. The English recognition model
|
|
43
|
+
//! and dictionary are downloaded by default. Other language families are automatically
|
|
44
|
+
//! downloaded when needed during document processing.
|
|
47
45
|
//!
|
|
48
46
|
//! # Examples
|
|
49
47
|
//!
|
|
@@ -63,6 +61,7 @@
|
|
|
63
61
|
//! println!("Detection model: {:?}", models.det_model);
|
|
64
62
|
//! println!("Classification model: {:?}", models.cls_model);
|
|
65
63
|
//! println!("Recognition model: {:?}", models.rec_model);
|
|
64
|
+
//! println!("Dictionary file: {:?}", models.dict_file);
|
|
66
65
|
//!
|
|
67
66
|
//! // Show cache statistics
|
|
68
67
|
//! let stats = manager.cache_stats()?;
|
|
@@ -173,13 +172,14 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
173
172
|
println!("Models being downloaded:");
|
|
174
173
|
println!(" - Detection model (PP-OCRv4 det)");
|
|
175
174
|
println!(" - Classification model (Mobile v2.0 cls)");
|
|
176
|
-
println!(" - Recognition model (PP-OCRv4 rec)");
|
|
177
|
-
println!("
|
|
178
|
-
println!("
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
//
|
|
175
|
+
println!(" - Recognition model (PP-OCRv4 rec, English)");
|
|
176
|
+
println!(" - Dictionary file (for text recognition)\n");
|
|
177
|
+
println!("Additional language family models are downloaded on-demand.");
|
|
178
|
+
println!("Supported families: English, Chinese, Latin, Korean, Eslav, Thai,");
|
|
179
|
+
println!("Greek, Arabic, Devanagari, Tamil, Telugu, Kannada.\n");
|
|
180
|
+
|
|
181
|
+
// Download and ensure models exist
|
|
182
|
+
// SHA256 checksums are now embedded and verified automatically
|
|
183
183
|
match manager.ensure_models_exist() {
|
|
184
184
|
Ok(paths) => {
|
|
185
185
|
println!("\nModels downloaded successfully!\n");
|
|
@@ -187,6 +187,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
187
187
|
println!(" Detection: {}", paths.det_model.display());
|
|
188
188
|
println!(" Classification: {}", paths.cls_model.display());
|
|
189
189
|
println!(" Recognition: {}", paths.rec_model.display());
|
|
190
|
+
println!(" Dictionary: {}", paths.dict_file.display());
|
|
190
191
|
}
|
|
191
192
|
Err(e) => {
|
|
192
193
|
eprintln!("Error downloading models: {}", e);
|
|
@@ -285,9 +286,10 @@ fn print_usage(program_name: &str) {
|
|
|
285
286
|
println!(" --help, -h Print this help message");
|
|
286
287
|
println!();
|
|
287
288
|
println!("NOTES:");
|
|
288
|
-
println!(" Language-specific
|
|
289
|
-
println!("
|
|
290
|
-
println!("
|
|
289
|
+
println!(" Language-specific models are supported and downloaded on-demand.");
|
|
290
|
+
println!(" Supported script families: English, Chinese, Latin, Korean, Eslav,");
|
|
291
|
+
println!(" Thai, Greek, Arabic, Devanagari, Tamil, Telugu, Kannada.");
|
|
292
|
+
println!(" See example documentation for language support details.");
|
|
291
293
|
println!();
|
|
292
294
|
println!("EXAMPLES:");
|
|
293
295
|
println!(" {} --cache-dir /tmp/models", program_name);
|
|
@@ -310,7 +312,7 @@ fn list_cache_contents(cache_dir: &PathBuf) -> Result<(), Box<dyn std::error::Er
|
|
|
310
312
|
if path.is_dir() {
|
|
311
313
|
println!(" [DIR] {}/", file_name.to_string_lossy());
|
|
312
314
|
|
|
313
|
-
// List files in subdirectory
|
|
315
|
+
// List files in subdirectory (2 levels deep for most, 3 for rec/)
|
|
314
316
|
for sub_entry in fs::read_dir(&path)? {
|
|
315
317
|
let sub_entry = sub_entry?;
|
|
316
318
|
let sub_path = sub_entry.path();
|
|
@@ -321,7 +323,23 @@ fn list_cache_contents(cache_dir: &PathBuf) -> Result<(), Box<dyn std::error::Er
|
|
|
321
323
|
let size_kb = metadata.len() as f64 / 1000.0;
|
|
322
324
|
println!(" - {} ({:.1} KB)", sub_name.to_string_lossy(), size_kb);
|
|
323
325
|
} else if sub_path.is_dir() {
|
|
326
|
+
// For rec/ directory, we have 3 levels: rec/{family}/{files}
|
|
324
327
|
println!(" [DIR] {}/", sub_name.to_string_lossy());
|
|
328
|
+
|
|
329
|
+
// List files in the third level
|
|
330
|
+
for third_entry in fs::read_dir(&sub_path)? {
|
|
331
|
+
let third_entry = third_entry?;
|
|
332
|
+
let third_path = third_entry.path();
|
|
333
|
+
let third_name = third_entry.file_name();
|
|
334
|
+
|
|
335
|
+
if third_path.is_file() {
|
|
336
|
+
let metadata = fs::metadata(&third_path)?;
|
|
337
|
+
let size_kb = metadata.len() as f64 / 1000.0;
|
|
338
|
+
println!(" - {} ({:.1} KB)", third_name.to_string_lossy(), size_kb);
|
|
339
|
+
} else if third_path.is_dir() {
|
|
340
|
+
println!(" [DIR] {}/", third_name.to_string_lossy());
|
|
341
|
+
}
|
|
342
|
+
}
|
|
325
343
|
}
|
|
326
344
|
}
|
|
327
345
|
} else if path.is_file() {
|