kreuzberg 4.1.0 → 4.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +8 -5
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
- data/kreuzberg.gemspec +1 -1
- data/lib/kreuzberg/api_proxy.rb +0 -1
- data/lib/kreuzberg/cli_proxy.rb +0 -1
- data/lib/kreuzberg/config.rb +33 -2
- data/lib/kreuzberg/mcp_proxy.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/spec/fixtures/config.toml +1 -1
- data/spec/fixtures/config.yaml +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +4 -4
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/mime.rs +15 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +17 -8
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- data/vendor/kreuzberg-tesseract/build.rs +4 -4
- data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
- metadata +6 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6550daabf58e5e396576e5a83c6a53f226e677f9c129920c9990bba309fbd7ba
|
|
4
|
+
data.tar.gz: 9595aa468666391d08a0962db589bbbc50d5bd1c8532e101efa234f6c523d7c5
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0dea911deebe061515dd4cbff2b76b3a7947c68f196fcc576001d42d80386f6c53f8ed63e0e4acb8e719ad6f95c21e689df7aef5f6cbbbc0d1c92ef96ddb673c
|
|
7
|
+
data.tar.gz: 0df091f80f7c73dda0c17d89d4aa0571cd01f0f2b697b187fd9bae28f8dbcf96cd2e3a269f9831a442b8cf46ce40608586d3d6a242d84bb394fe6056cba3b492
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.1.0)
|
|
4
|
+
kreuzberg (4.1.2)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -121,7 +121,7 @@ GEM
|
|
|
121
121
|
rubocop (~> 1.81)
|
|
122
122
|
ruby-progressbar (1.13.0)
|
|
123
123
|
securerandom (0.4.1)
|
|
124
|
-
sorbet-runtime (0.6.
|
|
124
|
+
sorbet-runtime (0.6.12897)
|
|
125
125
|
steep (1.10.0)
|
|
126
126
|
activesupport (>= 5.1)
|
|
127
127
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -207,7 +207,7 @@ CHECKSUMS
|
|
|
207
207
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
208
208
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
209
209
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
210
|
-
kreuzberg (4.1.0)
|
|
210
|
+
kreuzberg (4.1.2)
|
|
211
211
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
212
212
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
213
213
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -242,7 +242,7 @@ CHECKSUMS
|
|
|
242
242
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
243
243
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
244
244
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
245
|
-
sorbet-runtime (0.6.
|
|
245
|
+
sorbet-runtime (0.6.12897) sha256=0348ab8803c4c3646977fee298083ded9b7e74d5b34b50c567c63eb7e36eb286
|
|
246
246
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
247
247
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
248
248
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1.0" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1.2" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -86,10 +86,13 @@ gem 'kreuzberg'
|
|
|
86
86
|
|
|
87
87
|
### System Requirements
|
|
88
88
|
|
|
89
|
-
- **Ruby 2.
|
|
89
|
+
- **Ruby 3.2.0 or higher** required (including Ruby 4.x)
|
|
90
|
+
- Ruby 4.0+ is fully supported with no code changes required
|
|
90
91
|
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
|
91
92
|
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
|
92
93
|
|
|
94
|
+
**Ruby 4.0 Compatibility:** Kreuzberg is fully compatible with Ruby 4.0 (released December 25, 2025) and all Ruby 4.x versions. All tests pass with 100% compatibility. The gem compiles without any breaking changes. Key Ruby 4.0 features like Ruby Box, ZJIT compiler, and Ractor improvements work seamlessly with Kreuzberg.
|
|
95
|
+
|
|
93
96
|
|
|
94
97
|
|
|
95
98
|
## Quick Start
|
|
@@ -202,9 +205,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
202
205
|
|
|
203
206
|
## Features
|
|
204
207
|
|
|
205
|
-
### Supported File Formats (
|
|
208
|
+
### Supported File Formats (57+)
|
|
206
209
|
|
|
207
|
-
|
|
210
|
+
57 file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
208
211
|
|
|
209
212
|
#### Office Documents
|
|
210
213
|
|
|
@@ -230,7 +233,7 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
230
233
|
|----------|---------|----------|
|
|
231
234
|
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
|
|
232
235
|
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
|
|
233
|
-
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, reStructuredText, Org Mode |
|
|
236
|
+
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
|
|
234
237
|
|
|
235
238
|
#### Email & Archives
|
|
236
239
|
|
|
@@ -31,7 +31,7 @@ embeddings = ["kreuzberg/embeddings"]
|
|
|
31
31
|
|
|
32
32
|
[dependencies]
|
|
33
33
|
async-trait = "0.1.89"
|
|
34
|
-
kreuzberg = { path = "../../../../../crates/kreuzberg", default-features = false, features = [
|
|
34
|
+
kreuzberg = { path = "../../../vendor/kreuzberg", default-features = false, features = [
|
|
35
35
|
"pdf",
|
|
36
36
|
"excel",
|
|
37
37
|
"office",
|
|
@@ -51,7 +51,7 @@ kreuzberg = { path = "../../../../../crates/kreuzberg", default-features = false
|
|
|
51
51
|
"bundled-pdfium",
|
|
52
52
|
"tokio-runtime",
|
|
53
53
|
] }
|
|
54
|
-
kreuzberg-ffi = { path = "
|
|
54
|
+
kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
|
|
55
55
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
|
|
56
56
|
"rb-sys",
|
|
57
57
|
] }
|
|
@@ -1025,8 +1025,10 @@ pub fn config_from_file(path: String) -> Result<RHash, Error> {
|
|
|
1025
1025
|
.and_then(|v| magnus::RHash::try_convert(v).map_err(|_| validation_error("Config must be a Hash")))
|
|
1026
1026
|
}
|
|
1027
1027
|
|
|
1028
|
-
/// Discover extraction config from current directory
|
|
1028
|
+
/// Discover extraction config from current directory or parent directories
|
|
1029
1029
|
pub fn config_discover() -> Result<Value, Error> {
|
|
1030
|
+
use std::path::PathBuf;
|
|
1031
|
+
|
|
1030
1032
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1031
1033
|
|
|
1032
1034
|
// Search for config files in order of precedence
|
|
@@ -1038,19 +1040,27 @@ pub fn config_discover() -> Result<Value, Error> {
|
|
|
1038
1040
|
(".kreuzbergrc", "json"),
|
|
1039
1041
|
];
|
|
1040
1042
|
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1043
|
+
// Start from current directory and search up to parent directories
|
|
1044
|
+
let mut current_dir: Option<PathBuf> = std::env::current_dir().ok();
|
|
1045
|
+
|
|
1046
|
+
while let Some(dir) = current_dir {
|
|
1047
|
+
for (name, format) in &config_files {
|
|
1048
|
+
let config_path = dir.join(name);
|
|
1049
|
+
if let Ok(content) = fs::read_to_string(&config_path) {
|
|
1050
|
+
let json_value: serde_json::Value = match *format {
|
|
1051
|
+
"toml" => toml::from_str(&content)
|
|
1052
|
+
.map_err(|e| validation_error(format!("Invalid TOML in {}: {}", config_path.display(), e)))?,
|
|
1053
|
+
"yaml" => serde_yaml_ng::from_str(&content)
|
|
1054
|
+
.map_err(|e| validation_error(format!("Invalid YAML in {}: {}", config_path.display(), e)))?,
|
|
1055
|
+
"json" => serde_json::from_str(&content)
|
|
1056
|
+
.map_err(|e| validation_error(format!("Invalid JSON in {}: {}", config_path.display(), e)))?,
|
|
1057
|
+
_ => unreachable!(),
|
|
1058
|
+
};
|
|
1059
|
+
return json_value_to_ruby(&ruby, &json_value);
|
|
1060
|
+
}
|
|
1053
1061
|
}
|
|
1062
|
+
// Move to parent directory
|
|
1063
|
+
current_dir = dir.parent().map(|p| p.to_path_buf());
|
|
1054
1064
|
}
|
|
1055
1065
|
|
|
1056
1066
|
// Return nil if no config found
|
data/kreuzberg.gemspec
CHANGED
|
@@ -165,7 +165,7 @@ Gem::Specification.new do |spec|
|
|
|
165
165
|
DESC
|
|
166
166
|
spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
|
|
167
167
|
spec.license = 'MIT'
|
|
168
|
-
spec.required_ruby_version = '>= 3.2.0'
|
|
168
|
+
spec.required_ruby_version = '>= 3.2.0', '< 5.0'
|
|
169
169
|
|
|
170
170
|
spec.metadata = {
|
|
171
171
|
'homepage_uri' => spec.homepage,
|
data/lib/kreuzberg/api_proxy.rb
CHANGED
data/lib/kreuzberg/cli_proxy.rb
CHANGED
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -733,11 +733,42 @@ module Kreuzberg
|
|
|
733
733
|
# @example Load from YAML
|
|
734
734
|
# config = Kreuzberg::Config::Extraction.from_file("config.yaml")
|
|
735
735
|
#
|
|
736
|
+
# Keys that are allowed in the Extraction config
|
|
737
|
+
ALLOWED_KEYS = %i[
|
|
738
|
+
use_cache enable_quality_processing force_ocr ocr chunking
|
|
739
|
+
language_detection pdf_options image_extraction image_preprocessing
|
|
740
|
+
postprocessor token_reduction keywords html_options pages
|
|
741
|
+
max_concurrent_extractions
|
|
742
|
+
].freeze
|
|
743
|
+
|
|
744
|
+
# Aliases for backward compatibility
|
|
745
|
+
KEY_ALIASES = {
|
|
746
|
+
images: :image_extraction
|
|
747
|
+
}.freeze
|
|
748
|
+
|
|
736
749
|
def self.from_file(path)
|
|
737
750
|
hash = Kreuzberg._config_from_file_native(path)
|
|
738
|
-
new(**hash
|
|
751
|
+
new(**normalize_hash_keys(hash))
|
|
739
752
|
end
|
|
740
753
|
|
|
754
|
+
# Normalize hash keys from native function
|
|
755
|
+
# - Converts string keys to symbols
|
|
756
|
+
# - Maps aliased keys to their canonical names
|
|
757
|
+
# - Filters out unknown keys
|
|
758
|
+
def self.normalize_hash_keys(hash)
|
|
759
|
+
symbolized = hash.transform_keys(&:to_sym)
|
|
760
|
+
|
|
761
|
+
# Apply key aliases
|
|
762
|
+
KEY_ALIASES.each do |from, to|
|
|
763
|
+
symbolized[to] = symbolized.delete(from) if symbolized.key?(from) && !symbolized.key?(to)
|
|
764
|
+
end
|
|
765
|
+
|
|
766
|
+
# Filter to only allowed keys
|
|
767
|
+
symbolized.slice(*ALLOWED_KEYS)
|
|
768
|
+
end
|
|
769
|
+
|
|
770
|
+
private_class_method :normalize_hash_keys
|
|
771
|
+
|
|
741
772
|
# Discover configuration file in current or parent directories.
|
|
742
773
|
#
|
|
743
774
|
# Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
|
|
@@ -755,7 +786,7 @@ module Kreuzberg
|
|
|
755
786
|
hash = Kreuzberg._config_discover_native
|
|
756
787
|
return nil if hash.nil?
|
|
757
788
|
|
|
758
|
-
new(**hash
|
|
789
|
+
new(**normalize_hash_keys(hash))
|
|
759
790
|
end
|
|
760
791
|
|
|
761
792
|
def initialize(
|
data/lib/kreuzberg/mcp_proxy.rb
CHANGED
data/lib/kreuzberg/version.rb
CHANGED
data/spec/fixtures/config.toml
CHANGED
data/spec/fixtures/config.yaml
CHANGED
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.1.0"
|
|
3
|
+
version = "4.1.2"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -142,7 +142,7 @@ mime_guess = "2.0"
|
|
|
142
142
|
rmp-serde = "1.3"
|
|
143
143
|
thiserror = { workspace = true }
|
|
144
144
|
tokio = { workspace = true, optional = true }
|
|
145
|
-
uuid = { version = "1.
|
|
145
|
+
uuid = { version = "1.20.0", features = ["v4", "js"] }
|
|
146
146
|
indexmap = "2.13.0"
|
|
147
147
|
tracing = { workspace = true }
|
|
148
148
|
pdfium-render = { package = "kreuzberg-pdfium-render", version = "0.9.0", features = [
|
|
@@ -174,7 +174,7 @@ rst_parser = { version = "0.4", optional = true }
|
|
|
174
174
|
fb2 = { version = "0.4", optional = true }
|
|
175
175
|
typst-syntax = { version = "0.14", optional = true }
|
|
176
176
|
|
|
177
|
-
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
|
|
177
|
+
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", version = "4.1", optional = true }
|
|
178
178
|
image = { workspace = true, default-features = false, features = [
|
|
179
179
|
"png",
|
|
180
180
|
"jpeg",
|
|
@@ -198,7 +198,7 @@ rake = { version = "0.3.6", optional = true }
|
|
|
198
198
|
axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
|
|
199
199
|
tower = { version = "0.5", optional = true }
|
|
200
200
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
|
|
201
|
-
rmcp = { version = "0.
|
|
201
|
+
rmcp = { version = "0.14.0", features = [
|
|
202
202
|
"server",
|
|
203
203
|
"macros",
|
|
204
204
|
"base64",
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.1.0 Release**
|
|
20
|
+
> **🚀 Version 4.1.2 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -57,6 +57,11 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
|
|
|
57
57
|
m.insert("ods", OPENDOC_SPREADSHEET_MIME_TYPE);
|
|
58
58
|
|
|
59
59
|
m.insert("pptx", POWER_POINT_MIME_TYPE);
|
|
60
|
+
m.insert(
|
|
61
|
+
"ppsx",
|
|
62
|
+
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
|
63
|
+
);
|
|
64
|
+
m.insert("pptm", "application/vnd.ms-powerpoint.presentation.macroEnabled.12");
|
|
60
65
|
m.insert("ppt", LEGACY_POWERPOINT_MIME_TYPE);
|
|
61
66
|
|
|
62
67
|
m.insert("docx", DOCX_MIME_TYPE);
|
|
@@ -180,6 +185,8 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
|
|
180
185
|
|
|
181
186
|
set.insert(PDF_MIME_TYPE);
|
|
182
187
|
set.insert(POWER_POINT_MIME_TYPE);
|
|
188
|
+
set.insert("application/vnd.openxmlformats-officedocument.presentationml.slideshow"); // PPSX
|
|
189
|
+
set.insert("application/vnd.ms-powerpoint.presentation.macroEnabled.12"); // PPTM
|
|
183
190
|
set.insert(LEGACY_WORD_MIME_TYPE);
|
|
184
191
|
set.insert(LEGACY_POWERPOINT_MIME_TYPE);
|
|
185
192
|
set.insert(HTML_MIME_TYPE);
|
|
@@ -459,6 +466,14 @@ mod tests {
|
|
|
459
466
|
("test.xlsx", EXCEL_MIME_TYPE),
|
|
460
467
|
("test.xls", EXCEL_BINARY_MIME_TYPE),
|
|
461
468
|
("test.pptx", POWER_POINT_MIME_TYPE),
|
|
469
|
+
(
|
|
470
|
+
"test.ppsx",
|
|
471
|
+
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
|
472
|
+
),
|
|
473
|
+
(
|
|
474
|
+
"test.pptm",
|
|
475
|
+
"application/vnd.ms-powerpoint.presentation.macroEnabled.12",
|
|
476
|
+
),
|
|
462
477
|
("test.ppt", LEGACY_POWERPOINT_MIME_TYPE),
|
|
463
478
|
("test.docx", DOCX_MIME_TYPE),
|
|
464
479
|
("test.doc", LEGACY_WORD_MIME_TYPE),
|
|
@@ -60,9 +60,12 @@ fn parse_group(node: &Node) -> Result<Vec<SlideElement>> {
|
|
|
60
60
|
match tag_name {
|
|
61
61
|
"sp" => {
|
|
62
62
|
let position = extract_position(node);
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
63
|
+
// parse_sp returns None for shapes without txBody (e.g., image placeholders)
|
|
64
|
+
if let Some(content) = parse_sp(node)? {
|
|
65
|
+
match content {
|
|
66
|
+
ParsedContent::Text(text) => elements.push(SlideElement::Text(text, position)),
|
|
67
|
+
ParsedContent::List(list) => elements.push(SlideElement::List(list, position)),
|
|
68
|
+
}
|
|
66
69
|
}
|
|
67
70
|
}
|
|
68
71
|
"graphicFrame" => {
|
|
@@ -85,11 +88,17 @@ fn parse_group(node: &Node) -> Result<Vec<SlideElement>> {
|
|
|
85
88
|
Ok(elements)
|
|
86
89
|
}
|
|
87
90
|
|
|
88
|
-
fn parse_sp(sp_node: &Node) -> Result<ParsedContent> {
|
|
89
|
-
|
|
91
|
+
fn parse_sp(sp_node: &Node) -> Result<Option<ParsedContent>> {
|
|
92
|
+
// Some shapes like image placeholders (<p:ph type="pic"/>) don't have txBody.
|
|
93
|
+
// These should be skipped gracefully - they contain no text to extract.
|
|
94
|
+
// GitHub Issue #321 Bug 1
|
|
95
|
+
let tx_body_node = match sp_node
|
|
90
96
|
.children()
|
|
91
97
|
.find(|n| n.tag_name().name() == "txBody" && n.tag_name().namespace() == Some(P_NAMESPACE))
|
|
92
|
-
|
|
98
|
+
{
|
|
99
|
+
Some(node) => node,
|
|
100
|
+
None => return Ok(None), // Skip shapes without txBody
|
|
101
|
+
};
|
|
93
102
|
|
|
94
103
|
let is_list = tx_body_node.descendants().any(|n| {
|
|
95
104
|
n.is_element()
|
|
@@ -103,9 +112,9 @@ fn parse_sp(sp_node: &Node) -> Result<ParsedContent> {
|
|
|
103
112
|
});
|
|
104
113
|
|
|
105
114
|
if is_list {
|
|
106
|
-
Ok(ParsedContent::List(parse_list(&tx_body_node)?))
|
|
115
|
+
Ok(Some(ParsedContent::List(parse_list(&tx_body_node)?)))
|
|
107
116
|
} else {
|
|
108
|
-
Ok(ParsedContent::Text(parse_text(&tx_body_node)?))
|
|
117
|
+
Ok(Some(ParsedContent::Text(parse_text(&tx_body_node)?)))
|
|
109
118
|
}
|
|
110
119
|
}
|
|
111
120
|
|
|
@@ -0,0 +1,504 @@
|
|
|
1
|
+
//! Regression tests for PPTX/PPSX extraction bugs
|
|
2
|
+
//!
|
|
3
|
+
//! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
|
|
4
|
+
//!
|
|
5
|
+
//! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
|
|
6
|
+
//! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
|
|
7
|
+
|
|
8
|
+
#![cfg(feature = "office")]
|
|
9
|
+
|
|
10
|
+
use kreuzberg::{ExtractionConfig, extract_file};
|
|
11
|
+
use std::io::Write;
|
|
12
|
+
use tempfile::NamedTempFile;
|
|
13
|
+
use zip::CompressionMethod;
|
|
14
|
+
use zip::write::{FileOptions, ZipWriter};
|
|
15
|
+
|
|
16
|
+
/// Test that PPSX (PowerPoint Show) files are extracted correctly.
|
|
17
|
+
///
|
|
18
|
+
/// PPSX files use MIME type `application/vnd.openxmlformats-officedocument.presentationml.slideshow`
|
|
19
|
+
/// instead of PPTX's `application/vnd.openxmlformats-officedocument.presentationml.presentation`.
|
|
20
|
+
///
|
|
21
|
+
/// The internal structure is identical to PPTX - same slide XML format.
|
|
22
|
+
///
|
|
23
|
+
/// GitHub Issue #321 Bug 2
|
|
24
|
+
#[tokio::test]
|
|
25
|
+
async fn test_ppsx_slideshow_extraction() {
|
|
26
|
+
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
27
|
+
.parent()
|
|
28
|
+
.unwrap()
|
|
29
|
+
.parent()
|
|
30
|
+
.unwrap();
|
|
31
|
+
let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
|
|
32
|
+
|
|
33
|
+
if !test_file.exists() {
|
|
34
|
+
println!("Skipping test: PPSX test file not found at {:?}", test_file);
|
|
35
|
+
return;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
let result = extract_file(&test_file, None, &ExtractionConfig::default()).await;
|
|
39
|
+
|
|
40
|
+
match result {
|
|
41
|
+
Ok(extraction) => {
|
|
42
|
+
assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
|
|
43
|
+
println!("✅ PPSX extraction succeeded!");
|
|
44
|
+
println!(" Content length: {} chars", extraction.content.len());
|
|
45
|
+
println!(
|
|
46
|
+
" Content preview: {}",
|
|
47
|
+
&extraction.content[..extraction.content.len().min(200)]
|
|
48
|
+
);
|
|
49
|
+
}
|
|
50
|
+
Err(e) => {
|
|
51
|
+
panic!(
|
|
52
|
+
"PPSX extraction failed with error: {:?}\n\
|
|
53
|
+
This is GitHub Issue #321 Bug 2: PPSX files should be supported.\n\
|
|
54
|
+
PPSX MIME type (application/vnd.openxmlformats-officedocument.presentationml.slideshow) \
|
|
55
|
+
needs to be added to extension-to-MIME mapping.",
|
|
56
|
+
e
|
|
57
|
+
);
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/// Test that PPSX files can be extracted when MIME type is explicitly provided.
|
|
63
|
+
///
|
|
64
|
+
/// This validates that the PPTX extractor can handle PPSX content correctly
|
|
65
|
+
/// (the XML structure is identical), even if MIME detection fails.
|
|
66
|
+
///
|
|
67
|
+
/// GitHub Issue #321 Bug 2
|
|
68
|
+
#[tokio::test]
|
|
69
|
+
async fn test_ppsx_with_explicit_mime_type() {
|
|
70
|
+
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
71
|
+
.parent()
|
|
72
|
+
.unwrap()
|
|
73
|
+
.parent()
|
|
74
|
+
.unwrap();
|
|
75
|
+
let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
|
|
76
|
+
|
|
77
|
+
if !test_file.exists() {
|
|
78
|
+
println!("Skipping test: PPSX test file not found at {:?}", test_file);
|
|
79
|
+
return;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
// Explicitly provide the PPSX MIME type
|
|
83
|
+
let result = extract_file(
|
|
84
|
+
&test_file,
|
|
85
|
+
Some("application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
|
|
86
|
+
&ExtractionConfig::default(),
|
|
87
|
+
)
|
|
88
|
+
.await;
|
|
89
|
+
|
|
90
|
+
match result {
|
|
91
|
+
Ok(extraction) => {
|
|
92
|
+
assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
|
|
93
|
+
println!("✅ PPSX extraction with explicit MIME type succeeded!");
|
|
94
|
+
}
|
|
95
|
+
Err(e) => {
|
|
96
|
+
panic!(
|
|
97
|
+
"PPSX extraction with explicit MIME type failed: {:?}\n\
|
|
98
|
+
The PPTX extractor should handle PPSX content (identical XML structure).",
|
|
99
|
+
e
|
|
100
|
+
);
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
/// Test that PPTX files with image placeholder shapes (no txBody) are extracted correctly.
|
|
106
|
+
///
|
|
107
|
+
/// Some shapes in PPTX files, like image placeholders (`<p:ph type="pic"/>`), don't have
|
|
108
|
+
/// `<p:txBody>` children because they're designed to hold images, not text.
|
|
109
|
+
///
|
|
110
|
+
/// The parser should skip shapes without txBody gracefully instead of failing.
|
|
111
|
+
///
|
|
112
|
+
/// GitHub Issue #321 Bug 1
|
|
113
|
+
#[tokio::test]
|
|
114
|
+
async fn test_pptx_with_image_placeholder_no_txbody() {
|
|
115
|
+
// Create a minimal PPTX with a shape that has no txBody (image placeholder)
|
|
116
|
+
let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
|
|
117
|
+
|
|
118
|
+
{
|
|
119
|
+
let mut zip = ZipWriter::new(&mut temp_file);
|
|
120
|
+
let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
|
|
121
|
+
|
|
122
|
+
// Add [Content_Types].xml
|
|
123
|
+
zip.start_file("[Content_Types].xml", options).unwrap();
|
|
124
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
125
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
126
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
127
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
128
|
+
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
|
|
129
|
+
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
130
|
+
</Types>"#).unwrap();
|
|
131
|
+
|
|
132
|
+
// Add _rels/.rels
|
|
133
|
+
zip.start_file("_rels/.rels", options).unwrap();
|
|
134
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
135
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
136
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
137
|
+
</Relationships>"#).unwrap();
|
|
138
|
+
|
|
139
|
+
// Add ppt/presentation.xml
|
|
140
|
+
zip.start_file("ppt/presentation.xml", options).unwrap();
|
|
141
|
+
zip.write_all(
|
|
142
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
143
|
+
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
144
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
|
145
|
+
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
|
|
146
|
+
<p:sldIdLst>
|
|
147
|
+
<p:sldId id="256" r:id="rId2"/>
|
|
148
|
+
</p:sldIdLst>
|
|
149
|
+
</p:presentation>"#,
|
|
150
|
+
)
|
|
151
|
+
.unwrap();
|
|
152
|
+
|
|
153
|
+
// Add ppt/_rels/presentation.xml.rels
|
|
154
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
|
|
155
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
156
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
157
|
+
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
158
|
+
</Relationships>"#).unwrap();
|
|
159
|
+
|
|
160
|
+
// Add ppt/slides/slide1.xml with a shape WITHOUT txBody (image placeholder)
|
|
161
|
+
// This is the critical test case - a <p:sp> element with no <p:txBody>
|
|
162
|
+
zip.start_file("ppt/slides/slide1.xml", options).unwrap();
|
|
163
|
+
zip.write_all(
|
|
164
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
165
|
+
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
166
|
+
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
167
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
168
|
+
<p:cSld>
|
|
169
|
+
<p:spTree>
|
|
170
|
+
<p:nvGrpSpPr>
|
|
171
|
+
<p:cNvPr id="1" name=""/>
|
|
172
|
+
<p:cNvGrpSpPr/>
|
|
173
|
+
<p:nvPr/>
|
|
174
|
+
</p:nvGrpSpPr>
|
|
175
|
+
<p:grpSpPr>
|
|
176
|
+
<a:xfrm>
|
|
177
|
+
<a:off x="0" y="0"/>
|
|
178
|
+
<a:ext cx="0" cy="0"/>
|
|
179
|
+
<a:chOff x="0" y="0"/>
|
|
180
|
+
<a:chExt cx="0" cy="0"/>
|
|
181
|
+
</a:xfrm>
|
|
182
|
+
</p:grpSpPr>
|
|
183
|
+
|
|
184
|
+
<!-- Normal text shape WITH txBody - this should be extracted -->
|
|
185
|
+
<p:sp>
|
|
186
|
+
<p:nvSpPr>
|
|
187
|
+
<p:cNvPr id="2" name="Title"/>
|
|
188
|
+
<p:cNvSpPr/>
|
|
189
|
+
<p:nvPr/>
|
|
190
|
+
</p:nvSpPr>
|
|
191
|
+
<p:spPr>
|
|
192
|
+
<a:xfrm>
|
|
193
|
+
<a:off x="0" y="0"/>
|
|
194
|
+
<a:ext cx="100000" cy="100000"/>
|
|
195
|
+
</a:xfrm>
|
|
196
|
+
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
|
|
197
|
+
</p:spPr>
|
|
198
|
+
<p:txBody>
|
|
199
|
+
<a:bodyPr/>
|
|
200
|
+
<a:lstStyle/>
|
|
201
|
+
<a:p>
|
|
202
|
+
<a:r>
|
|
203
|
+
<a:rPr lang="en-US"/>
|
|
204
|
+
<a:t>This is the title text</a:t>
|
|
205
|
+
</a:r>
|
|
206
|
+
</a:p>
|
|
207
|
+
</p:txBody>
|
|
208
|
+
</p:sp>
|
|
209
|
+
|
|
210
|
+
<!-- IMAGE PLACEHOLDER shape WITHOUT txBody - this caused the "No txBody found" error -->
|
|
211
|
+
<!-- This is a valid PPTX structure - image placeholders don't contain text -->
|
|
212
|
+
<p:sp>
|
|
213
|
+
<p:nvSpPr>
|
|
214
|
+
<p:cNvPr id="99" name="Image Placeholder"/>
|
|
215
|
+
<p:cNvSpPr>
|
|
216
|
+
<a:spLocks noGrp="1"/>
|
|
217
|
+
</p:cNvSpPr>
|
|
218
|
+
<p:nvPr>
|
|
219
|
+
<p:ph type="pic" idx="1"/>
|
|
220
|
+
</p:nvPr>
|
|
221
|
+
</p:nvSpPr>
|
|
222
|
+
<p:spPr>
|
|
223
|
+
<a:xfrm>
|
|
224
|
+
<a:off x="0" y="0"/>
|
|
225
|
+
<a:ext cx="100000" cy="100000"/>
|
|
226
|
+
</a:xfrm>
|
|
227
|
+
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
|
|
228
|
+
</p:spPr>
|
|
229
|
+
<!-- NOTE: No <p:txBody> here - this is valid for image placeholders -->
|
|
230
|
+
</p:sp>
|
|
231
|
+
|
|
232
|
+
<!-- Another normal text shape - should also be extracted -->
|
|
233
|
+
<p:sp>
|
|
234
|
+
<p:nvSpPr>
|
|
235
|
+
<p:cNvPr id="3" name="Content"/>
|
|
236
|
+
<p:cNvSpPr/>
|
|
237
|
+
<p:nvPr/>
|
|
238
|
+
</p:nvSpPr>
|
|
239
|
+
<p:spPr>
|
|
240
|
+
<a:xfrm>
|
|
241
|
+
<a:off x="0" y="200000"/>
|
|
242
|
+
<a:ext cx="100000" cy="100000"/>
|
|
243
|
+
</a:xfrm>
|
|
244
|
+
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
|
|
245
|
+
</p:spPr>
|
|
246
|
+
<p:txBody>
|
|
247
|
+
<a:bodyPr/>
|
|
248
|
+
<a:lstStyle/>
|
|
249
|
+
<a:p>
|
|
250
|
+
<a:r>
|
|
251
|
+
<a:rPr lang="en-US"/>
|
|
252
|
+
<a:t>Content after image placeholder</a:t>
|
|
253
|
+
</a:r>
|
|
254
|
+
</a:p>
|
|
255
|
+
</p:txBody>
|
|
256
|
+
</p:sp>
|
|
257
|
+
|
|
258
|
+
</p:spTree>
|
|
259
|
+
</p:cSld>
|
|
260
|
+
</p:sld>"#,
|
|
261
|
+
)
|
|
262
|
+
.unwrap();
|
|
263
|
+
|
|
264
|
+
// Add ppt/slides/_rels/slide1.xml.rels (empty)
|
|
265
|
+
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
|
|
266
|
+
zip.write_all(
|
|
267
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
268
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
269
|
+
</Relationships>"#,
|
|
270
|
+
)
|
|
271
|
+
.unwrap();
|
|
272
|
+
|
|
273
|
+
zip.finish().unwrap();
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
// Extract the PPTX file
|
|
277
|
+
let result = extract_file(
|
|
278
|
+
temp_file.path(),
|
|
279
|
+
Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
|
|
280
|
+
&ExtractionConfig::default(),
|
|
281
|
+
)
|
|
282
|
+
.await;
|
|
283
|
+
|
|
284
|
+
match result {
|
|
285
|
+
Ok(extraction) => {
|
|
286
|
+
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
287
|
+
|
|
288
|
+
// Verify we extracted text from shapes that DO have txBody
|
|
289
|
+
assert!(
|
|
290
|
+
extraction.content.contains("title text"),
|
|
291
|
+
"Should extract text from first shape with txBody. Got: {}",
|
|
292
|
+
extraction.content
|
|
293
|
+
);
|
|
294
|
+
assert!(
|
|
295
|
+
extraction.content.contains("Content after"),
|
|
296
|
+
"Should extract text from shape after image placeholder. Got: {}",
|
|
297
|
+
extraction.content
|
|
298
|
+
);
|
|
299
|
+
|
|
300
|
+
println!("✅ PPTX with image placeholder (no txBody) extraction succeeded!");
|
|
301
|
+
println!(" Content: {}", extraction.content);
|
|
302
|
+
}
|
|
303
|
+
Err(e) => {
|
|
304
|
+
let error_msg = format!("{:?}", e);
|
|
305
|
+
if error_msg.contains("No txBody found") {
|
|
306
|
+
panic!(
|
|
307
|
+
"PPTX extraction failed with 'No txBody found' error!\n\
|
|
308
|
+
This is GitHub Issue #321 Bug 1.\n\
|
|
309
|
+
The parser should skip shapes without txBody (image placeholders) \
|
|
310
|
+
instead of failing.\n\
|
|
311
|
+
Error: {:?}",
|
|
312
|
+
e
|
|
313
|
+
);
|
|
314
|
+
} else {
|
|
315
|
+
panic!("PPTX extraction failed with unexpected error: {:?}", e);
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
/// Test extraction of PPTX with multiple shapes, some with txBody, some without.
|
|
322
|
+
///
|
|
323
|
+
/// This test verifies that:
|
|
324
|
+
/// 1. Shapes WITH txBody are extracted
|
|
325
|
+
/// 2. Shapes WITHOUT txBody (image placeholders, etc.) are skipped gracefully
|
|
326
|
+
/// 3. The extraction continues and doesn't fail on the first shape without txBody
|
|
327
|
+
///
|
|
328
|
+
/// GitHub Issue #321 Bug 1
|
|
329
|
+
#[tokio::test]
|
|
330
|
+
async fn test_pptx_mixed_shapes_extraction() {
|
|
331
|
+
// Create a PPTX with multiple slides, each containing mixed shapes
|
|
332
|
+
let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
|
|
333
|
+
|
|
334
|
+
{
|
|
335
|
+
let mut zip = ZipWriter::new(&mut temp_file);
|
|
336
|
+
let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
|
|
337
|
+
|
|
338
|
+
// Add [Content_Types].xml
|
|
339
|
+
zip.start_file("[Content_Types].xml", options).unwrap();
|
|
340
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
341
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
342
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
343
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
344
|
+
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
|
|
345
|
+
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
346
|
+
</Types>"#).unwrap();
|
|
347
|
+
|
|
348
|
+
// Add _rels/.rels
|
|
349
|
+
zip.start_file("_rels/.rels", options).unwrap();
|
|
350
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
351
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
352
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
353
|
+
</Relationships>"#).unwrap();
|
|
354
|
+
|
|
355
|
+
// Add ppt/presentation.xml
|
|
356
|
+
zip.start_file("ppt/presentation.xml", options).unwrap();
|
|
357
|
+
zip.write_all(
|
|
358
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
359
|
+
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
360
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
|
361
|
+
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
|
|
362
|
+
<p:sldIdLst>
|
|
363
|
+
<p:sldId id="256" r:id="rId2"/>
|
|
364
|
+
</p:sldIdLst>
|
|
365
|
+
</p:presentation>"#,
|
|
366
|
+
)
|
|
367
|
+
.unwrap();
|
|
368
|
+
|
|
369
|
+
// Add ppt/_rels/presentation.xml.rels
|
|
370
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
|
|
371
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
372
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
373
|
+
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
374
|
+
</Relationships>"#).unwrap();
|
|
375
|
+
|
|
376
|
+
// Add slide with various shapes - some with txBody, some without
|
|
377
|
+
zip.start_file("ppt/slides/slide1.xml", options).unwrap();
|
|
378
|
+
zip.write_all(
|
|
379
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
380
|
+
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
381
|
+
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
382
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
383
|
+
<p:cSld>
|
|
384
|
+
<p:spTree>
|
|
385
|
+
<p:nvGrpSpPr>
|
|
386
|
+
<p:cNvPr id="1" name=""/>
|
|
387
|
+
<p:cNvGrpSpPr/>
|
|
388
|
+
<p:nvPr/>
|
|
389
|
+
</p:nvGrpSpPr>
|
|
390
|
+
<p:grpSpPr/>
|
|
391
|
+
|
|
392
|
+
<!-- Shape 1: Normal text -->
|
|
393
|
+
<p:sp>
|
|
394
|
+
<p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
|
|
395
|
+
<p:spPr/>
|
|
396
|
+
<p:txBody>
|
|
397
|
+
<a:bodyPr/><a:lstStyle/>
|
|
398
|
+
<a:p><a:r><a:t>First Text Shape</a:t></a:r></a:p>
|
|
399
|
+
</p:txBody>
|
|
400
|
+
</p:sp>
|
|
401
|
+
|
|
402
|
+
<!-- Shape 2: Image placeholder (NO txBody) -->
|
|
403
|
+
<p:sp>
|
|
404
|
+
<p:nvSpPr>
|
|
405
|
+
<p:cNvPr id="10" name="Picture Placeholder"/>
|
|
406
|
+
<p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
|
|
407
|
+
<p:nvPr><p:ph type="pic"/></p:nvPr>
|
|
408
|
+
</p:nvSpPr>
|
|
409
|
+
<p:spPr/>
|
|
410
|
+
</p:sp>
|
|
411
|
+
|
|
412
|
+
<!-- Shape 3: Another text shape -->
|
|
413
|
+
<p:sp>
|
|
414
|
+
<p:nvSpPr><p:cNvPr id="3" name="Body"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
|
|
415
|
+
<p:spPr/>
|
|
416
|
+
<p:txBody>
|
|
417
|
+
<a:bodyPr/><a:lstStyle/>
|
|
418
|
+
<a:p><a:r><a:t>Second Text Shape</a:t></a:r></a:p>
|
|
419
|
+
</p:txBody>
|
|
420
|
+
</p:sp>
|
|
421
|
+
|
|
422
|
+
<!-- Shape 4: Chart placeholder (NO txBody) -->
|
|
423
|
+
<p:sp>
|
|
424
|
+
<p:nvSpPr>
|
|
425
|
+
<p:cNvPr id="11" name="Chart Placeholder"/>
|
|
426
|
+
<p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
|
|
427
|
+
<p:nvPr><p:ph type="chart"/></p:nvPr>
|
|
428
|
+
</p:nvSpPr>
|
|
429
|
+
<p:spPr/>
|
|
430
|
+
</p:sp>
|
|
431
|
+
|
|
432
|
+
<!-- Shape 5: Content placeholder (NO txBody - empty) -->
|
|
433
|
+
<p:sp>
|
|
434
|
+
<p:nvSpPr>
|
|
435
|
+
<p:cNvPr id="12" name="Content Placeholder"/>
|
|
436
|
+
<p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
|
|
437
|
+
<p:nvPr><p:ph type="body"/></p:nvPr>
|
|
438
|
+
</p:nvSpPr>
|
|
439
|
+
<p:spPr/>
|
|
440
|
+
</p:sp>
|
|
441
|
+
|
|
442
|
+
<!-- Shape 6: Final text shape -->
|
|
443
|
+
<p:sp>
|
|
444
|
+
<p:nvSpPr><p:cNvPr id="4" name="Footer"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
|
|
445
|
+
<p:spPr/>
|
|
446
|
+
<p:txBody>
|
|
447
|
+
<a:bodyPr/><a:lstStyle/>
|
|
448
|
+
<a:p><a:r><a:t>Third Text Shape</a:t></a:r></a:p>
|
|
449
|
+
</p:txBody>
|
|
450
|
+
</p:sp>
|
|
451
|
+
|
|
452
|
+
</p:spTree>
|
|
453
|
+
</p:cSld>
|
|
454
|
+
</p:sld>"#,
|
|
455
|
+
)
|
|
456
|
+
.unwrap();
|
|
457
|
+
|
|
458
|
+
// Add empty rels
|
|
459
|
+
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
|
|
460
|
+
zip.write_all(
|
|
461
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
462
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
463
|
+
</Relationships>"#,
|
|
464
|
+
)
|
|
465
|
+
.unwrap();
|
|
466
|
+
|
|
467
|
+
zip.finish().unwrap();
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
let result = extract_file(
|
|
471
|
+
temp_file.path(),
|
|
472
|
+
Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
|
|
473
|
+
&ExtractionConfig::default(),
|
|
474
|
+
)
|
|
475
|
+
.await;
|
|
476
|
+
|
|
477
|
+
match result {
|
|
478
|
+
Ok(extraction) => {
|
|
479
|
+
// All three text shapes should be extracted
|
|
480
|
+
assert!(
|
|
481
|
+
extraction.content.contains("First Text Shape"),
|
|
482
|
+
"Should extract first text shape"
|
|
483
|
+
);
|
|
484
|
+
assert!(
|
|
485
|
+
extraction.content.contains("Second Text Shape"),
|
|
486
|
+
"Should extract second text shape (after image placeholder)"
|
|
487
|
+
);
|
|
488
|
+
assert!(
|
|
489
|
+
extraction.content.contains("Third Text Shape"),
|
|
490
|
+
"Should extract third text shape (after multiple placeholders)"
|
|
491
|
+
);
|
|
492
|
+
|
|
493
|
+
println!("✅ PPTX mixed shapes extraction succeeded!");
|
|
494
|
+
println!(" All text shapes extracted despite image/chart/content placeholders without txBody");
|
|
495
|
+
}
|
|
496
|
+
Err(e) => {
|
|
497
|
+
panic!(
|
|
498
|
+
"PPTX extraction failed: {:?}\n\
|
|
499
|
+
Shapes without txBody should be skipped gracefully.",
|
|
500
|
+
e
|
|
501
|
+
);
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-tesseract"
|
|
3
|
-
version = "4.1.0"
|
|
3
|
+
version = "4.1.2"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -21,10 +21,10 @@ libc = { workspace = true }
|
|
|
21
21
|
thiserror = { workspace = true }
|
|
22
22
|
|
|
23
23
|
[dev-dependencies]
|
|
24
|
-
image = { workspace = true }
|
|
24
|
+
image = { workspace = true, features = ["png"] }
|
|
25
25
|
|
|
26
26
|
[build-dependencies]
|
|
27
|
-
cc = { version = "^1.2.
|
|
27
|
+
cc = { version = "^1.2.54", optional = true }
|
|
28
28
|
cmake = { version = "0.1.57", optional = true }
|
|
29
29
|
zip = { version = "7.2.0", optional = true }
|
|
30
30
|
|
|
@@ -38,7 +38,7 @@ mod build_tesseract {
|
|
|
38
38
|
return None;
|
|
39
39
|
}
|
|
40
40
|
}
|
|
41
|
-
Some(path.join("tesseract-rs-cache"))
|
|
41
|
+
Some(path.join("kreuzberg-tesseract-cache"))
|
|
42
42
|
}
|
|
43
43
|
|
|
44
44
|
fn get_preferred_out_dir() -> PathBuf {
|
|
@@ -63,14 +63,14 @@ mod build_tesseract {
|
|
|
63
63
|
PathBuf::from(home_dir)
|
|
64
64
|
.join("Library")
|
|
65
65
|
.join("Application Support")
|
|
66
|
-
.join("tesseract-rs")
|
|
66
|
+
.join("kreuzberg-tesseract")
|
|
67
67
|
} else if cfg!(target_os = "linux") {
|
|
68
68
|
let home_dir = env::var("HOME").unwrap_or_else(|_| {
|
|
69
69
|
env::var("USER")
|
|
70
70
|
.map(|user| format!("/home/{}", user))
|
|
71
71
|
.expect("Neither HOME nor USER environment variable set")
|
|
72
72
|
});
|
|
73
|
-
PathBuf::from(home_dir).join(".tesseract-rs")
|
|
73
|
+
PathBuf::from(home_dir).join(".kreuzberg-tesseract")
|
|
74
74
|
} else {
|
|
75
75
|
panic!("Unsupported operating system");
|
|
76
76
|
}
|
|
@@ -117,7 +117,7 @@ mod build_tesseract {
|
|
|
117
117
|
"cargo:warning=Failed to create cache dir {:?}: {}. Falling back to temp dir.",
|
|
118
118
|
preferred, err
|
|
119
119
|
);
|
|
120
|
-
let fallback = env::temp_dir().join("tesseract-rs-cache");
|
|
120
|
+
let fallback = env::temp_dir().join("kreuzberg-tesseract-cache");
|
|
121
121
|
fs::create_dir_all(&fallback).expect("Failed to create fallback cache directory in temp dir");
|
|
122
122
|
fallback
|
|
123
123
|
}
|
|
@@ -6,16 +6,16 @@
|
|
|
6
6
|
#![allow(clippy::not_unsafe_ptr_arg_deref)]
|
|
7
7
|
#![allow(clippy::cmp_null)]
|
|
8
8
|
|
|
9
|
-
//! # tesseract-rs
|
|
9
|
+
//! # kreuzberg-tesseract
|
|
10
10
|
//!
|
|
11
|
-
//! `tesseract-rs` provides safe Rust bindings for Tesseract OCR with built-in compilation
|
|
11
|
+
//! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
|
|
12
12
|
//! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality
|
|
13
13
|
//! easily accessible in Rust projects while handling the complexity of interfacing
|
|
14
14
|
//! with the underlying C++ libraries.
|
|
15
15
|
//!
|
|
16
16
|
//! ## Usage
|
|
17
17
|
//!
|
|
18
|
-
//! Here's a basic example of how to use `tesseract-rs`:
|
|
18
|
+
//! Here's a basic example of how to use `kreuzberg-tesseract`:
|
|
19
19
|
//!
|
|
20
20
|
//! ```rust
|
|
21
21
|
//! use std::path::PathBuf;
|
|
@@ -28,16 +28,16 @@
|
|
|
28
28
|
//! PathBuf::from(home_dir)
|
|
29
29
|
//! .join("Library")
|
|
30
30
|
//! .join("Application Support")
|
|
31
|
-
//! .join("tesseract-rs")
|
|
31
|
+
//! .join("kreuzberg-tesseract")
|
|
32
32
|
//! .join("tessdata")
|
|
33
33
|
//! } else if cfg!(target_os = "linux") {
|
|
34
34
|
//! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
|
|
35
35
|
//! PathBuf::from(home_dir)
|
|
36
|
-
//! .join(".tesseract-rs")
|
|
36
|
+
//! .join(".kreuzberg-tesseract")
|
|
37
37
|
//! .join("tessdata")
|
|
38
38
|
//! } else if cfg!(target_os = "windows") {
|
|
39
39
|
//! PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
|
|
40
|
-
//! .join("tesseract-rs")
|
|
40
|
+
//! .join("kreuzberg-tesseract")
|
|
41
41
|
//! .join("tessdata")
|
|
42
42
|
//! } else {
|
|
43
43
|
//! panic!("Unsupported operating system");
|
|
@@ -7,7 +7,7 @@ fn get_default_tessdata_dir() -> PathBuf {
|
|
|
7
7
|
PathBuf::from(home_dir)
|
|
8
8
|
.join("Library")
|
|
9
9
|
.join("Application Support")
|
|
10
|
-
.join("tesseract-rs")
|
|
10
|
+
.join("kreuzberg-tesseract")
|
|
11
11
|
.join("tessdata")
|
|
12
12
|
} else if cfg!(target_os = "linux") {
|
|
13
13
|
let system_paths = [
|
|
@@ -20,10 +20,10 @@ fn get_default_tessdata_dir() -> PathBuf {
|
|
|
20
20
|
}
|
|
21
21
|
}
|
|
22
22
|
let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
|
|
23
|
-
PathBuf::from(home_dir).join(".tesseract-rs").join("tessdata")
|
|
23
|
+
PathBuf::from(home_dir).join(".kreuzberg-tesseract").join("tessdata")
|
|
24
24
|
} else if cfg!(target_os = "windows") {
|
|
25
25
|
PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
|
|
26
|
-
.join("tesseract-rs")
|
|
26
|
+
.join("kreuzberg-tesseract")
|
|
27
27
|
.join("tessdata")
|
|
28
28
|
} else {
|
|
29
29
|
panic!("Unsupported operating system");
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.1.0
|
|
4
|
+
version: 4.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01-
|
|
11
|
+
date: 2026-01-25 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -761,6 +761,7 @@ files:
|
|
|
761
761
|
- vendor/kreuzberg/tests/plugin_postprocessor_test.rs
|
|
762
762
|
- vendor/kreuzberg/tests/plugin_system.rs
|
|
763
763
|
- vendor/kreuzberg/tests/plugin_validator_test.rs
|
|
764
|
+
- vendor/kreuzberg/tests/pptx_regression_tests.rs
|
|
764
765
|
- vendor/kreuzberg/tests/registry_integration_tests.rs
|
|
765
766
|
- vendor/kreuzberg/tests/rst_extractor_tests.rs
|
|
766
767
|
- vendor/kreuzberg/tests/rtf_extractor_tests.rs
|
|
@@ -790,6 +791,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
790
791
|
- - ">="
|
|
791
792
|
- !ruby/object:Gem::Version
|
|
792
793
|
version: 3.2.0
|
|
794
|
+
- - "<"
|
|
795
|
+
- !ruby/object:Gem::Version
|
|
796
|
+
version: '5.0'
|
|
793
797
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
794
798
|
requirements:
|
|
795
799
|
- - ">="
|