kreuzberg 4.1.0 → 4.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: df75a873a0c3547b2e6e44fa1c04e939372f5c01839c1d81ea671414e5c98016
4
- data.tar.gz: e4f60f88486c2807c6bf45959e6e6160d509ed096236a5d5bf0c70c26cf8f2f8
3
+ metadata.gz: 6550daabf58e5e396576e5a83c6a53f226e677f9c129920c9990bba309fbd7ba
4
+ data.tar.gz: 9595aa468666391d08a0962db589bbbc50d5bd1c8532e101efa234f6c523d7c5
5
5
  SHA512:
6
- metadata.gz: fac3f1dc6ca132f71f8034536f3ca7b7df53542a760061932e447e5cde3906d61903bb9ade16e96b782de1b879be68c50ee59218c3b0ba908d26d515d90d4966
7
- data.tar.gz: 79b7ab92c373b7a06d06fe324b3d06ad2a588c9e993a2c8cb23c474e3452d82914d2ac48eb9edc1b9475cbf337b4e09a160c242d3510b33364d7f511d264e36f
6
+ metadata.gz: 0dea911deebe061515dd4cbff2b76b3a7947c68f196fcc576001d42d80386f6c53f8ed63e0e4acb8e719ad6f95c21e689df7aef5f6cbbbc0d1c92ef96ddb673c
7
+ data.tar.gz: 0df091f80f7c73dda0c17d89d4aa0571cd01f0f2b697b187fd9bae28f8dbcf96cd2e3a269f9831a442b8cf46ce40608586d3d6a242d84bb394fe6056cba3b492
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.1.0)
4
+ kreuzberg (4.1.2)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -121,7 +121,7 @@ GEM
121
121
  rubocop (~> 1.81)
122
122
  ruby-progressbar (1.13.0)
123
123
  securerandom (0.4.1)
124
- sorbet-runtime (0.6.12894)
124
+ sorbet-runtime (0.6.12897)
125
125
  steep (1.10.0)
126
126
  activesupport (>= 5.1)
127
127
  concurrent-ruby (>= 1.1.10)
@@ -207,7 +207,7 @@ CHECKSUMS
207
207
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
208
208
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
209
209
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
210
- kreuzberg (4.1.0)
210
+ kreuzberg (4.1.2)
211
211
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
212
212
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
213
213
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -242,7 +242,7 @@ CHECKSUMS
242
242
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
243
243
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
244
244
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
245
- sorbet-runtime (0.6.12894) sha256=4f0cbe041d80dac973ec3a5a848679922074dd77cc19f46384b27a8b9ff4a90c
245
+ sorbet-runtime (0.6.12897) sha256=0348ab8803c4c3646977fee298083ded9b7e74d5b34b50c567c63eb7e36eb286
246
246
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
247
247
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
248
248
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1.0" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1.2" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -86,10 +86,13 @@ gem 'kreuzberg'
86
86
 
87
87
  ### System Requirements
88
88
 
89
- - **Ruby 2.7+** required
89
+ - **Ruby 3.2.0 or higher** required (including Ruby 4.x)
90
+ - Ruby 4.0+ is fully supported with no code changes required
90
91
  - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
91
92
  - Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
92
93
 
94
+ **Ruby 4.0 Compatibility:** Kreuzberg is fully compatible with Ruby 4.0 (released December 25, 2025) and all Ruby 4.x versions. The entire test suite passes on Ruby 4.0, and the gem compiles without requiring any code changes. Key Ruby 4.0 features such as Ruby Box, the ZJIT compiler, and the Ractor improvements work seamlessly with Kreuzberg.
95
+
93
96
 
94
97
 
95
98
  ## Quick Start
@@ -202,9 +205,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
202
205
 
203
206
  ## Features
204
207
 
205
- ### Supported File Formats (56+)
208
+ ### Supported File Formats (57+)
206
209
 
207
- 56 file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
210
+ 57 file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
208
211
 
209
212
  #### Office Documents
210
213
 
@@ -230,7 +233,7 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
230
233
  |----------|---------|----------|
231
234
  | **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
232
235
  | **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
233
- | **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, reStructuredText, Org Mode |
236
+ | **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
234
237
 
235
238
  #### Email & Archives
236
239
 
@@ -31,7 +31,7 @@ embeddings = ["kreuzberg/embeddings"]
31
31
 
32
32
  [dependencies]
33
33
  async-trait = "0.1.89"
34
- kreuzberg = { path = "../../../../../crates/kreuzberg", default-features = false, features = [
34
+ kreuzberg = { path = "../../../vendor/kreuzberg", default-features = false, features = [
35
35
  "pdf",
36
36
  "excel",
37
37
  "office",
@@ -51,7 +51,7 @@ kreuzberg = { path = "../../../../../crates/kreuzberg", default-features = false
51
51
  "bundled-pdfium",
52
52
  "tokio-runtime",
53
53
  ] }
54
- kreuzberg-ffi = { path = "../../../../../crates/kreuzberg-ffi" }
54
+ kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
55
55
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
56
56
  "rb-sys",
57
57
  ] }
@@ -1025,8 +1025,10 @@ pub fn config_from_file(path: String) -> Result<RHash, Error> {
1025
1025
  .and_then(|v| magnus::RHash::try_convert(v).map_err(|_| validation_error("Config must be a Hash")))
1026
1026
  }
1027
1027
 
1028
- /// Discover extraction config from current directory
1028
+ /// Discover extraction config from current directory or parent directories
1029
1029
  pub fn config_discover() -> Result<Value, Error> {
1030
+ use std::path::PathBuf;
1031
+
1030
1032
  let ruby = Ruby::get().expect("Ruby not initialized");
1031
1033
 
1032
1034
  // Search for config files in order of precedence
@@ -1038,19 +1040,27 @@ pub fn config_discover() -> Result<Value, Error> {
1038
1040
  (".kreuzbergrc", "json"),
1039
1041
  ];
1040
1042
 
1041
- for (name, format) in config_files {
1042
- if let Ok(content) = fs::read_to_string(name) {
1043
- let json_value: serde_json::Value = match format {
1044
- "toml" => toml::from_str(&content)
1045
- .map_err(|e| validation_error(format!("Invalid TOML in {}: {}", name, e)))?,
1046
- "yaml" => serde_yaml_ng::from_str(&content)
1047
- .map_err(|e| validation_error(format!("Invalid YAML in {}: {}", name, e)))?,
1048
- "json" => serde_json::from_str(&content)
1049
- .map_err(|e| validation_error(format!("Invalid JSON in {}: {}", name, e)))?,
1050
- _ => unreachable!(),
1051
- };
1052
- return json_value_to_ruby(&ruby, &json_value);
1043
+ // Start from current directory and search up to parent directories
1044
+ let mut current_dir: Option<PathBuf> = std::env::current_dir().ok();
1045
+
1046
+ while let Some(dir) = current_dir {
1047
+ for (name, format) in &config_files {
1048
+ let config_path = dir.join(name);
1049
+ if let Ok(content) = fs::read_to_string(&config_path) {
1050
+ let json_value: serde_json::Value = match *format {
1051
+ "toml" => toml::from_str(&content)
1052
+ .map_err(|e| validation_error(format!("Invalid TOML in {}: {}", config_path.display(), e)))?,
1053
+ "yaml" => serde_yaml_ng::from_str(&content)
1054
+ .map_err(|e| validation_error(format!("Invalid YAML in {}: {}", config_path.display(), e)))?,
1055
+ "json" => serde_json::from_str(&content)
1056
+ .map_err(|e| validation_error(format!("Invalid JSON in {}: {}", config_path.display(), e)))?,
1057
+ _ => unreachable!(),
1058
+ };
1059
+ return json_value_to_ruby(&ruby, &json_value);
1060
+ }
1053
1061
  }
1062
+ // Move to parent directory
1063
+ current_dir = dir.parent().map(|p| p.to_path_buf());
1054
1064
  }
1055
1065
 
1056
1066
  // Return nil if no config found
data/kreuzberg.gemspec CHANGED
@@ -165,7 +165,7 @@ Gem::Specification.new do |spec|
165
165
  DESC
166
166
  spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
167
167
  spec.license = 'MIT'
168
- spec.required_ruby_version = '>= 3.2.0'
168
+ spec.required_ruby_version = '>= 3.2.0', '< 5.0'
169
169
 
170
170
  spec.metadata = {
171
171
  'homepage_uri' => spec.homepage,
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'open3'
4
- require 'pathname'
5
4
 
6
5
  module Kreuzberg
7
6
  # @example Start the server
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'open3'
4
- require 'pathname'
5
4
 
6
5
  module Kreuzberg
7
6
  # @example
@@ -733,11 +733,42 @@ module Kreuzberg
733
733
  # @example Load from YAML
734
734
  # config = Kreuzberg::Config::Extraction.from_file("config.yaml")
735
735
  #
736
+ # Keys that are allowed in the Extraction config
737
+ ALLOWED_KEYS = %i[
738
+ use_cache enable_quality_processing force_ocr ocr chunking
739
+ language_detection pdf_options image_extraction image_preprocessing
740
+ postprocessor token_reduction keywords html_options pages
741
+ max_concurrent_extractions
742
+ ].freeze
743
+
744
+ # Aliases for backward compatibility
745
+ KEY_ALIASES = {
746
+ images: :image_extraction
747
+ }.freeze
748
+
736
749
  def self.from_file(path)
737
750
  hash = Kreuzberg._config_from_file_native(path)
738
- new(**hash.transform_keys(&:to_sym))
751
+ new(**normalize_hash_keys(hash))
739
752
  end
740
753
 
754
+ # Normalize hash keys from native function
755
+ # - Converts string keys to symbols
756
+ # - Maps aliased keys to their canonical names
757
+ # - Filters out unknown keys
758
+ def self.normalize_hash_keys(hash)
759
+ symbolized = hash.transform_keys(&:to_sym)
760
+
761
+ # Apply key aliases
762
+ KEY_ALIASES.each do |from, to|
763
+ symbolized[to] = symbolized.delete(from) if symbolized.key?(from) && !symbolized.key?(to)
764
+ end
765
+
766
+ # Filter to only allowed keys
767
+ symbolized.slice(*ALLOWED_KEYS)
768
+ end
769
+
770
+ private_class_method :normalize_hash_keys
771
+
741
772
  # Discover configuration file in current or parent directories.
742
773
  #
743
774
  # Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
@@ -755,7 +786,7 @@ module Kreuzberg
755
786
  hash = Kreuzberg._config_discover_native
756
787
  return nil if hash.nil?
757
788
 
758
- new(**hash.transform_keys(&:to_sym))
789
+ new(**normalize_hash_keys(hash))
759
790
  end
760
791
 
761
792
  def initialize(
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'open3'
4
- require 'pathname'
5
4
  require 'json'
6
5
 
7
6
  module Kreuzberg
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.1.0'
4
+ VERSION = '4.1.2'
5
5
  end
@@ -21,7 +21,7 @@ extract_images = true
21
21
  passwords = ["secret", "backup"]
22
22
  extract_metadata = true
23
23
 
24
- [images]
24
+ [image_extraction]
25
25
  extract_images = true
26
26
  target_dpi = 600
27
27
  max_image_dimension = 2000
@@ -23,7 +23,7 @@ pdf_options:
23
23
  - password2
24
24
  extract_metadata: true
25
25
 
26
- images:
26
+ image_extraction:
27
27
  extract_images: true
28
28
  target_dpi: 300
29
29
  max_image_dimension: 4096
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.1.0"
6
+ version = "4.1.2"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.1.0"
3
+ version = "4.1.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -142,7 +142,7 @@ mime_guess = "2.0"
142
142
  rmp-serde = "1.3"
143
143
  thiserror = { workspace = true }
144
144
  tokio = { workspace = true, optional = true }
145
- uuid = { version = "1.19.0", features = ["v4", "js"] }
145
+ uuid = { version = "1.20.0", features = ["v4", "js"] }
146
146
  indexmap = "2.13.0"
147
147
  tracing = { workspace = true }
148
148
  pdfium-render = { package = "kreuzberg-pdfium-render", version = "0.9.0", features = [
@@ -174,7 +174,7 @@ rst_parser = { version = "0.4", optional = true }
174
174
  fb2 = { version = "0.4", optional = true }
175
175
  typst-syntax = { version = "0.14", optional = true }
176
176
 
177
- kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
177
+ kreuzberg-tesseract = { path = "../kreuzberg-tesseract", version = "4.1", optional = true }
178
178
  image = { workspace = true, default-features = false, features = [
179
179
  "png",
180
180
  "jpeg",
@@ -198,7 +198,7 @@ rake = { version = "0.3.6", optional = true }
198
198
  axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
199
199
  tower = { version = "0.5", optional = true }
200
200
  tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
201
- rmcp = { version = "0.13.0", features = [
201
+ rmcp = { version = "0.14.0", features = [
202
202
  "server",
203
203
  "macros",
204
204
  "base64",
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.1.0 Release**
20
+ > **🚀 Version 4.1.2 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -57,6 +57,11 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
57
57
  m.insert("ods", OPENDOC_SPREADSHEET_MIME_TYPE);
58
58
 
59
59
  m.insert("pptx", POWER_POINT_MIME_TYPE);
60
+ m.insert(
61
+ "ppsx",
62
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
63
+ );
64
+ m.insert("pptm", "application/vnd.ms-powerpoint.presentation.macroEnabled.12");
60
65
  m.insert("ppt", LEGACY_POWERPOINT_MIME_TYPE);
61
66
 
62
67
  m.insert("docx", DOCX_MIME_TYPE);
@@ -180,6 +185,8 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
180
185
 
181
186
  set.insert(PDF_MIME_TYPE);
182
187
  set.insert(POWER_POINT_MIME_TYPE);
188
+ set.insert("application/vnd.openxmlformats-officedocument.presentationml.slideshow"); // PPSX
189
+ set.insert("application/vnd.ms-powerpoint.presentation.macroEnabled.12"); // PPTM
183
190
  set.insert(LEGACY_WORD_MIME_TYPE);
184
191
  set.insert(LEGACY_POWERPOINT_MIME_TYPE);
185
192
  set.insert(HTML_MIME_TYPE);
@@ -459,6 +466,14 @@ mod tests {
459
466
  ("test.xlsx", EXCEL_MIME_TYPE),
460
467
  ("test.xls", EXCEL_BINARY_MIME_TYPE),
461
468
  ("test.pptx", POWER_POINT_MIME_TYPE),
469
+ (
470
+ "test.ppsx",
471
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
472
+ ),
473
+ (
474
+ "test.pptm",
475
+ "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
476
+ ),
462
477
  ("test.ppt", LEGACY_POWERPOINT_MIME_TYPE),
463
478
  ("test.docx", DOCX_MIME_TYPE),
464
479
  ("test.doc", LEGACY_WORD_MIME_TYPE),
@@ -60,9 +60,12 @@ fn parse_group(node: &Node) -> Result<Vec<SlideElement>> {
60
60
  match tag_name {
61
61
  "sp" => {
62
62
  let position = extract_position(node);
63
- match parse_sp(node)? {
64
- ParsedContent::Text(text) => elements.push(SlideElement::Text(text, position)),
65
- ParsedContent::List(list) => elements.push(SlideElement::List(list, position)),
63
+ // parse_sp returns None for shapes without txBody (e.g., image placeholders)
64
+ if let Some(content) = parse_sp(node)? {
65
+ match content {
66
+ ParsedContent::Text(text) => elements.push(SlideElement::Text(text, position)),
67
+ ParsedContent::List(list) => elements.push(SlideElement::List(list, position)),
68
+ }
66
69
  }
67
70
  }
68
71
  "graphicFrame" => {
@@ -85,11 +88,17 @@ fn parse_group(node: &Node) -> Result<Vec<SlideElement>> {
85
88
  Ok(elements)
86
89
  }
87
90
 
88
- fn parse_sp(sp_node: &Node) -> Result<ParsedContent> {
89
- let tx_body_node = sp_node
91
+ fn parse_sp(sp_node: &Node) -> Result<Option<ParsedContent>> {
92
+ // Some shapes like image placeholders (<p:ph type="pic"/>) don't have txBody.
93
+ // These should be skipped gracefully - they contain no text to extract.
94
+ // GitHub Issue #321 Bug 1
95
+ let tx_body_node = match sp_node
90
96
  .children()
91
97
  .find(|n| n.tag_name().name() == "txBody" && n.tag_name().namespace() == Some(P_NAMESPACE))
92
- .ok_or_else(|| KreuzbergError::parsing("No txBody found".to_string()))?;
98
+ {
99
+ Some(node) => node,
100
+ None => return Ok(None), // Skip shapes without txBody
101
+ };
93
102
 
94
103
  let is_list = tx_body_node.descendants().any(|n| {
95
104
  n.is_element()
@@ -103,9 +112,9 @@ fn parse_sp(sp_node: &Node) -> Result<ParsedContent> {
103
112
  });
104
113
 
105
114
  if is_list {
106
- Ok(ParsedContent::List(parse_list(&tx_body_node)?))
115
+ Ok(Some(ParsedContent::List(parse_list(&tx_body_node)?)))
107
116
  } else {
108
- Ok(ParsedContent::Text(parse_text(&tx_body_node)?))
117
+ Ok(Some(ParsedContent::Text(parse_text(&tx_body_node)?)))
109
118
  }
110
119
  }
111
120
 
@@ -0,0 +1,504 @@
1
+ //! Regression tests for PPTX/PPSX extraction bugs
2
+ //!
3
+ //! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
4
+ //!
5
+ //! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
6
+ //! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
7
+
8
+ #![cfg(feature = "office")]
9
+
10
+ use kreuzberg::{ExtractionConfig, extract_file};
11
+ use std::io::Write;
12
+ use tempfile::NamedTempFile;
13
+ use zip::CompressionMethod;
14
+ use zip::write::{FileOptions, ZipWriter};
15
+
16
+ /// Test that PPSX (PowerPoint Show) files are extracted correctly.
17
+ ///
18
+ /// PPSX files use MIME type `application/vnd.openxmlformats-officedocument.presentationml.slideshow`
19
+ /// instead of PPTX's `application/vnd.openxmlformats-officedocument.presentationml.presentation`.
20
+ ///
21
+ /// The internal structure is identical to PPTX - same slide XML format.
22
+ ///
23
+ /// GitHub Issue #321 Bug 2
24
+ #[tokio::test]
25
+ async fn test_ppsx_slideshow_extraction() {
26
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
27
+ .parent()
28
+ .unwrap()
29
+ .parent()
30
+ .unwrap();
31
+ let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
32
+
33
+ if !test_file.exists() {
34
+ println!("Skipping test: PPSX test file not found at {:?}", test_file);
35
+ return;
36
+ }
37
+
38
+ let result = extract_file(&test_file, None, &ExtractionConfig::default()).await;
39
+
40
+ match result {
41
+ Ok(extraction) => {
42
+ assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
43
+ println!("✅ PPSX extraction succeeded!");
44
+ println!(" Content length: {} chars", extraction.content.len());
45
+ println!(
46
+ " Content preview: {}",
47
+ &extraction.content[..extraction.content.len().min(200)]
48
+ );
49
+ }
50
+ Err(e) => {
51
+ panic!(
52
+ "PPSX extraction failed with error: {:?}\n\
53
+ This is GitHub Issue #321 Bug 2: PPSX files should be supported.\n\
54
+ PPSX MIME type (application/vnd.openxmlformats-officedocument.presentationml.slideshow) \
55
+ needs to be added to extension-to-MIME mapping.",
56
+ e
57
+ );
58
+ }
59
+ }
60
+ }
61
+
62
+ /// Test that PPSX files can be extracted when MIME type is explicitly provided.
63
+ ///
64
+ /// This validates that the PPTX extractor can handle PPSX content correctly
65
+ /// (the XML structure is identical), even if MIME detection fails.
66
+ ///
67
+ /// GitHub Issue #321 Bug 2
68
+ #[tokio::test]
69
+ async fn test_ppsx_with_explicit_mime_type() {
70
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
71
+ .parent()
72
+ .unwrap()
73
+ .parent()
74
+ .unwrap();
75
+ let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
76
+
77
+ if !test_file.exists() {
78
+ println!("Skipping test: PPSX test file not found at {:?}", test_file);
79
+ return;
80
+ }
81
+
82
+ // Explicitly provide the PPSX MIME type
83
+ let result = extract_file(
84
+ &test_file,
85
+ Some("application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
86
+ &ExtractionConfig::default(),
87
+ )
88
+ .await;
89
+
90
+ match result {
91
+ Ok(extraction) => {
92
+ assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
93
+ println!("✅ PPSX extraction with explicit MIME type succeeded!");
94
+ }
95
+ Err(e) => {
96
+ panic!(
97
+ "PPSX extraction with explicit MIME type failed: {:?}\n\
98
+ The PPTX extractor should handle PPSX content (identical XML structure).",
99
+ e
100
+ );
101
+ }
102
+ }
103
+ }
104
+
105
+ /// Test that PPTX files with image placeholder shapes (no txBody) are extracted correctly.
106
+ ///
107
+ /// Some shapes in PPTX files, like image placeholders (`<p:ph type="pic"/>`), don't have
108
+ /// `<p:txBody>` children because they're designed to hold images, not text.
109
+ ///
110
+ /// The parser should skip shapes without txBody gracefully instead of failing.
111
+ ///
112
+ /// GitHub Issue #321 Bug 1
113
+ #[tokio::test]
114
+ async fn test_pptx_with_image_placeholder_no_txbody() {
115
+ // Create a minimal PPTX with a shape that has no txBody (image placeholder)
116
+ let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
117
+
118
+ {
119
+ let mut zip = ZipWriter::new(&mut temp_file);
120
+ let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
121
+
122
+ // Add [Content_Types].xml
123
+ zip.start_file("[Content_Types].xml", options).unwrap();
124
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
125
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
126
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
127
+ <Default Extension="xml" ContentType="application/xml"/>
128
+ <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
129
+ <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
130
+ </Types>"#).unwrap();
131
+
132
+ // Add _rels/.rels
133
+ zip.start_file("_rels/.rels", options).unwrap();
134
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
135
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
136
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
137
+ </Relationships>"#).unwrap();
138
+
139
+ // Add ppt/presentation.xml
140
+ zip.start_file("ppt/presentation.xml", options).unwrap();
141
+ zip.write_all(
142
+ br#"<?xml version="1.0" encoding="UTF-8"?>
143
+ <p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
144
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
145
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
146
+ <p:sldIdLst>
147
+ <p:sldId id="256" r:id="rId2"/>
148
+ </p:sldIdLst>
149
+ </p:presentation>"#,
150
+ )
151
+ .unwrap();
152
+
153
+ // Add ppt/_rels/presentation.xml.rels
154
+ zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
155
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
156
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
157
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
158
+ </Relationships>"#).unwrap();
159
+
160
+ // Add ppt/slides/slide1.xml with a shape WITHOUT txBody (image placeholder)
161
+ // This is the critical test case - a <p:sp> element with no <p:txBody>
162
+ zip.start_file("ppt/slides/slide1.xml", options).unwrap();
163
+ zip.write_all(
164
+ br#"<?xml version="1.0" encoding="UTF-8"?>
165
+ <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
166
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
167
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
168
+ <p:cSld>
169
+ <p:spTree>
170
+ <p:nvGrpSpPr>
171
+ <p:cNvPr id="1" name=""/>
172
+ <p:cNvGrpSpPr/>
173
+ <p:nvPr/>
174
+ </p:nvGrpSpPr>
175
+ <p:grpSpPr>
176
+ <a:xfrm>
177
+ <a:off x="0" y="0"/>
178
+ <a:ext cx="0" cy="0"/>
179
+ <a:chOff x="0" y="0"/>
180
+ <a:chExt cx="0" cy="0"/>
181
+ </a:xfrm>
182
+ </p:grpSpPr>
183
+
184
+ <!-- Normal text shape WITH txBody - this should be extracted -->
185
+ <p:sp>
186
+ <p:nvSpPr>
187
+ <p:cNvPr id="2" name="Title"/>
188
+ <p:cNvSpPr/>
189
+ <p:nvPr/>
190
+ </p:nvSpPr>
191
+ <p:spPr>
192
+ <a:xfrm>
193
+ <a:off x="0" y="0"/>
194
+ <a:ext cx="100000" cy="100000"/>
195
+ </a:xfrm>
196
+ <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
197
+ </p:spPr>
198
+ <p:txBody>
199
+ <a:bodyPr/>
200
+ <a:lstStyle/>
201
+ <a:p>
202
+ <a:r>
203
+ <a:rPr lang="en-US"/>
204
+ <a:t>This is the title text</a:t>
205
+ </a:r>
206
+ </a:p>
207
+ </p:txBody>
208
+ </p:sp>
209
+
210
+ <!-- IMAGE PLACEHOLDER shape WITHOUT txBody - this caused the "No txBody found" error -->
211
+ <!-- This is a valid PPTX structure - image placeholders don't contain text -->
212
+ <p:sp>
213
+ <p:nvSpPr>
214
+ <p:cNvPr id="99" name="Image Placeholder"/>
215
+ <p:cNvSpPr>
216
+ <a:spLocks noGrp="1"/>
217
+ </p:cNvSpPr>
218
+ <p:nvPr>
219
+ <p:ph type="pic" idx="1"/>
220
+ </p:nvPr>
221
+ </p:nvSpPr>
222
+ <p:spPr>
223
+ <a:xfrm>
224
+ <a:off x="0" y="0"/>
225
+ <a:ext cx="100000" cy="100000"/>
226
+ </a:xfrm>
227
+ <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
228
+ </p:spPr>
229
+ <!-- NOTE: No <p:txBody> here - this is valid for image placeholders -->
230
+ </p:sp>
231
+
232
+ <!-- Another normal text shape - should also be extracted -->
233
+ <p:sp>
234
+ <p:nvSpPr>
235
+ <p:cNvPr id="3" name="Content"/>
236
+ <p:cNvSpPr/>
237
+ <p:nvPr/>
238
+ </p:nvSpPr>
239
+ <p:spPr>
240
+ <a:xfrm>
241
+ <a:off x="0" y="200000"/>
242
+ <a:ext cx="100000" cy="100000"/>
243
+ </a:xfrm>
244
+ <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
245
+ </p:spPr>
246
+ <p:txBody>
247
+ <a:bodyPr/>
248
+ <a:lstStyle/>
249
+ <a:p>
250
+ <a:r>
251
+ <a:rPr lang="en-US"/>
252
+ <a:t>Content after image placeholder</a:t>
253
+ </a:r>
254
+ </a:p>
255
+ </p:txBody>
256
+ </p:sp>
257
+
258
+ </p:spTree>
259
+ </p:cSld>
260
+ </p:sld>"#,
261
+ )
262
+ .unwrap();
263
+
264
+ // Add ppt/slides/_rels/slide1.xml.rels (empty)
265
+ zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
266
+ zip.write_all(
267
+ br#"<?xml version="1.0" encoding="UTF-8"?>
268
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
269
+ </Relationships>"#,
270
+ )
271
+ .unwrap();
272
+
273
+ zip.finish().unwrap();
274
+ }
275
+
276
+ // Extract the PPTX file
277
+ let result = extract_file(
278
+ temp_file.path(),
279
+ Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
280
+ &ExtractionConfig::default(),
281
+ )
282
+ .await;
283
+
284
+ match result {
285
+ Ok(extraction) => {
286
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
287
+
288
+ // Verify we extracted text from shapes that DO have txBody
289
+ assert!(
290
+ extraction.content.contains("title text"),
291
+ "Should extract text from first shape with txBody. Got: {}",
292
+ extraction.content
293
+ );
294
+ assert!(
295
+ extraction.content.contains("Content after"),
296
+ "Should extract text from shape after image placeholder. Got: {}",
297
+ extraction.content
298
+ );
299
+
300
+ println!("✅ PPTX with image placeholder (no txBody) extraction succeeded!");
301
+ println!(" Content: {}", extraction.content);
302
+ }
303
+ Err(e) => {
304
+ let error_msg = format!("{:?}", e);
305
+ if error_msg.contains("No txBody found") {
306
+ panic!(
307
+ "PPTX extraction failed with 'No txBody found' error!\n\
308
+ This is GitHub Issue #321 Bug 1.\n\
309
+ The parser should skip shapes without txBody (image placeholders) \
310
+ instead of failing.\n\
311
+ Error: {:?}",
312
+ e
313
+ );
314
+ } else {
315
+ panic!("PPTX extraction failed with unexpected error: {:?}", e);
316
+ }
317
+ }
318
+ }
319
+ }
320
+
321
+ /// Test extraction of PPTX with multiple shapes, some with txBody, some without.
322
+ ///
323
+ /// This test verifies that:
324
+ /// 1. Shapes WITH txBody are extracted
325
+ /// 2. Shapes WITHOUT txBody (image placeholders, etc.) are skipped gracefully
326
+ /// 3. The extraction continues and doesn't fail on the first shape without txBody
327
+ ///
328
+ /// GitHub Issue #321 Bug 1
329
+ #[tokio::test]
330
+ async fn test_pptx_mixed_shapes_extraction() {
331
+ // Create a PPTX with a single slide containing mixed shapes (some with txBody, some without)
332
+ let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
333
+
334
+ {
335
+ let mut zip = ZipWriter::new(&mut temp_file);
336
+ let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
337
+
338
+ // Add [Content_Types].xml
339
+ zip.start_file("[Content_Types].xml", options).unwrap();
340
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
341
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
342
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
343
+ <Default Extension="xml" ContentType="application/xml"/>
344
+ <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
345
+ <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
346
+ </Types>"#).unwrap();
347
+
348
+ // Add _rels/.rels
349
+ zip.start_file("_rels/.rels", options).unwrap();
350
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
351
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
352
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
353
+ </Relationships>"#).unwrap();
354
+
355
+ // Add ppt/presentation.xml
356
+ zip.start_file("ppt/presentation.xml", options).unwrap();
357
+ zip.write_all(
358
+ br#"<?xml version="1.0" encoding="UTF-8"?>
359
+ <p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
360
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
361
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
362
+ <p:sldIdLst>
363
+ <p:sldId id="256" r:id="rId2"/>
364
+ </p:sldIdLst>
365
+ </p:presentation>"#,
366
+ )
367
+ .unwrap();
368
+
369
+ // Add ppt/_rels/presentation.xml.rels
370
+ zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
371
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
372
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
373
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
374
+ </Relationships>"#).unwrap();
375
+
376
+ // Add slide with various shapes - some with txBody, some without
377
+ zip.start_file("ppt/slides/slide1.xml", options).unwrap();
378
+ zip.write_all(
379
+ br#"<?xml version="1.0" encoding="UTF-8"?>
380
+ <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
381
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
382
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
383
+ <p:cSld>
384
+ <p:spTree>
385
+ <p:nvGrpSpPr>
386
+ <p:cNvPr id="1" name=""/>
387
+ <p:cNvGrpSpPr/>
388
+ <p:nvPr/>
389
+ </p:nvGrpSpPr>
390
+ <p:grpSpPr/>
391
+
392
+ <!-- Shape 1: Normal text -->
393
+ <p:sp>
394
+ <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
395
+ <p:spPr/>
396
+ <p:txBody>
397
+ <a:bodyPr/><a:lstStyle/>
398
+ <a:p><a:r><a:t>First Text Shape</a:t></a:r></a:p>
399
+ </p:txBody>
400
+ </p:sp>
401
+
402
+ <!-- Shape 2: Image placeholder (NO txBody) -->
403
+ <p:sp>
404
+ <p:nvSpPr>
405
+ <p:cNvPr id="10" name="Picture Placeholder"/>
406
+ <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
407
+ <p:nvPr><p:ph type="pic"/></p:nvPr>
408
+ </p:nvSpPr>
409
+ <p:spPr/>
410
+ </p:sp>
411
+
412
+ <!-- Shape 3: Another text shape -->
413
+ <p:sp>
414
+ <p:nvSpPr><p:cNvPr id="3" name="Body"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
415
+ <p:spPr/>
416
+ <p:txBody>
417
+ <a:bodyPr/><a:lstStyle/>
418
+ <a:p><a:r><a:t>Second Text Shape</a:t></a:r></a:p>
419
+ </p:txBody>
420
+ </p:sp>
421
+
422
+ <!-- Shape 4: Chart placeholder (NO txBody) -->
423
+ <p:sp>
424
+ <p:nvSpPr>
425
+ <p:cNvPr id="11" name="Chart Placeholder"/>
426
+ <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
427
+ <p:nvPr><p:ph type="chart"/></p:nvPr>
428
+ </p:nvSpPr>
429
+ <p:spPr/>
430
+ </p:sp>
431
+
432
+ <!-- Shape 5: Content placeholder (NO txBody - empty) -->
433
+ <p:sp>
434
+ <p:nvSpPr>
435
+ <p:cNvPr id="12" name="Content Placeholder"/>
436
+ <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
437
+ <p:nvPr><p:ph type="body"/></p:nvPr>
438
+ </p:nvSpPr>
439
+ <p:spPr/>
440
+ </p:sp>
441
+
442
+ <!-- Shape 6: Final text shape -->
443
+ <p:sp>
444
+ <p:nvSpPr><p:cNvPr id="4" name="Footer"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
445
+ <p:spPr/>
446
+ <p:txBody>
447
+ <a:bodyPr/><a:lstStyle/>
448
+ <a:p><a:r><a:t>Third Text Shape</a:t></a:r></a:p>
449
+ </p:txBody>
450
+ </p:sp>
451
+
452
+ </p:spTree>
453
+ </p:cSld>
454
+ </p:sld>"#,
455
+ )
456
+ .unwrap();
457
+
458
+ // Add empty rels
459
+ zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
460
+ zip.write_all(
461
+ br#"<?xml version="1.0" encoding="UTF-8"?>
462
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
463
+ </Relationships>"#,
464
+ )
465
+ .unwrap();
466
+
467
+ zip.finish().unwrap();
468
+ }
469
+
470
+ let result = extract_file(
471
+ temp_file.path(),
472
+ Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
473
+ &ExtractionConfig::default(),
474
+ )
475
+ .await;
476
+
477
+ match result {
478
+ Ok(extraction) => {
479
+ // All three text shapes should be extracted
480
+ assert!(
481
+ extraction.content.contains("First Text Shape"),
482
+ "Should extract first text shape"
483
+ );
484
+ assert!(
485
+ extraction.content.contains("Second Text Shape"),
486
+ "Should extract second text shape (after image placeholder)"
487
+ );
488
+ assert!(
489
+ extraction.content.contains("Third Text Shape"),
490
+ "Should extract third text shape (after multiple placeholders)"
491
+ );
492
+
493
+ println!("✅ PPTX mixed shapes extraction succeeded!");
494
+ println!(" All text shapes extracted despite image/chart/content placeholders without txBody");
495
+ }
496
+ Err(e) => {
497
+ panic!(
498
+ "PPTX extraction failed: {:?}\n\
499
+ Shapes without txBody should be skipped gracefully.",
500
+ e
501
+ );
502
+ }
503
+ }
504
+ }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.1.0"
3
+ version = "4.1.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -21,10 +21,10 @@ libc = { workspace = true }
21
21
  thiserror = { workspace = true }
22
22
 
23
23
  [dev-dependencies]
24
- image = { workspace = true }
24
+ image = { workspace = true, features = ["png"] }
25
25
 
26
26
  [build-dependencies]
27
- cc = { version = "^1.2.53", optional = true }
27
+ cc = { version = "^1.2.54", optional = true }
28
28
  cmake = { version = "0.1.57", optional = true }
29
29
  zip = { version = "7.2.0", optional = true }
30
30
 
@@ -38,7 +38,7 @@ mod build_tesseract {
38
38
  return None;
39
39
  }
40
40
  }
41
- Some(path.join("tesseract-rs-cache"))
41
+ Some(path.join("kreuzberg-tesseract-cache"))
42
42
  }
43
43
 
44
44
  fn get_preferred_out_dir() -> PathBuf {
@@ -63,14 +63,14 @@ mod build_tesseract {
63
63
  PathBuf::from(home_dir)
64
64
  .join("Library")
65
65
  .join("Application Support")
66
- .join("tesseract-rs")
66
+ .join("kreuzberg-tesseract")
67
67
  } else if cfg!(target_os = "linux") {
68
68
  let home_dir = env::var("HOME").unwrap_or_else(|_| {
69
69
  env::var("USER")
70
70
  .map(|user| format!("/home/{}", user))
71
71
  .expect("Neither HOME nor USER environment variable set")
72
72
  });
73
- PathBuf::from(home_dir).join(".tesseract-rs")
73
+ PathBuf::from(home_dir).join(".kreuzberg-tesseract")
74
74
  } else {
75
75
  panic!("Unsupported operating system");
76
76
  }
@@ -117,7 +117,7 @@ mod build_tesseract {
117
117
  "cargo:warning=Failed to create cache dir {:?}: {}. Falling back to temp dir.",
118
118
  preferred, err
119
119
  );
120
- let fallback = env::temp_dir().join("tesseract-rs-cache");
120
+ let fallback = env::temp_dir().join("kreuzberg-tesseract-cache");
121
121
  fs::create_dir_all(&fallback).expect("Failed to create fallback cache directory in temp dir");
122
122
  fallback
123
123
  }
@@ -6,16 +6,16 @@
6
6
  #![allow(clippy::not_unsafe_ptr_arg_deref)]
7
7
  #![allow(clippy::cmp_null)]
8
8
 
9
- //! # tesseract-rs
9
+ //! # kreuzberg-tesseract
10
10
  //!
11
- //! `tesseract-rs` provides safe Rust bindings for Tesseract OCR with built-in compilation
11
+ //! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
12
12
  //! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality
13
13
  //! easily accessible in Rust projects while handling the complexity of interfacing
14
14
  //! with the underlying C++ libraries.
15
15
  //!
16
16
  //! ## Usage
17
17
  //!
18
- //! Here's a basic example of how to use `tesseract-rs`:
18
+ //! Here's a basic example of how to use `kreuzberg-tesseract`:
19
19
  //!
20
20
  //! ```rust
21
21
  //! use std::path::PathBuf;
@@ -28,16 +28,16 @@
28
28
  //! PathBuf::from(home_dir)
29
29
  //! .join("Library")
30
30
  //! .join("Application Support")
31
- //! .join("tesseract-rs")
31
+ //! .join("kreuzberg-tesseract")
32
32
  //! .join("tessdata")
33
33
  //! } else if cfg!(target_os = "linux") {
34
34
  //! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
35
35
  //! PathBuf::from(home_dir)
36
- //! .join(".tesseract-rs")
36
+ //! .join(".kreuzberg-tesseract")
37
37
  //! .join("tessdata")
38
38
  //! } else if cfg!(target_os = "windows") {
39
39
  //! PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
40
- //! .join("tesseract-rs")
40
+ //! .join("kreuzberg-tesseract")
41
41
  //! .join("tessdata")
42
42
  //! } else {
43
43
  //! panic!("Unsupported operating system");
@@ -7,7 +7,7 @@ fn get_default_tessdata_dir() -> PathBuf {
7
7
  PathBuf::from(home_dir)
8
8
  .join("Library")
9
9
  .join("Application Support")
10
- .join("tesseract-rs")
10
+ .join("kreuzberg-tesseract")
11
11
  .join("tessdata")
12
12
  } else if cfg!(target_os = "linux") {
13
13
  let system_paths = [
@@ -20,10 +20,10 @@ fn get_default_tessdata_dir() -> PathBuf {
20
20
  }
21
21
  }
22
22
  let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
23
- PathBuf::from(home_dir).join(".tesseract-rs").join("tessdata")
23
+ PathBuf::from(home_dir).join(".kreuzberg-tesseract").join("tessdata")
24
24
  } else if cfg!(target_os = "windows") {
25
25
  PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
26
- .join("tesseract-rs")
26
+ .join("kreuzberg-tesseract")
27
27
  .join("tessdata")
28
28
  } else {
29
29
  panic!("Unsupported operating system");
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.1.0
4
+ version: 4.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-22 00:00:00.000000000 Z
11
+ date: 2026-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -761,6 +761,7 @@ files:
761
761
  - vendor/kreuzberg/tests/plugin_postprocessor_test.rs
762
762
  - vendor/kreuzberg/tests/plugin_system.rs
763
763
  - vendor/kreuzberg/tests/plugin_validator_test.rs
764
+ - vendor/kreuzberg/tests/pptx_regression_tests.rs
764
765
  - vendor/kreuzberg/tests/registry_integration_tests.rs
765
766
  - vendor/kreuzberg/tests/rst_extractor_tests.rs
766
767
  - vendor/kreuzberg/tests/rtf_extractor_tests.rs
@@ -790,6 +791,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
790
791
  - - ">="
791
792
  - !ruby/object:Gem::Version
792
793
  version: 3.2.0
794
+ - - "<"
795
+ - !ruby/object:Gem::Version
796
+ version: '5.0'
793
797
  required_rubygems_version: !ruby/object:Gem::Requirement
794
798
  requirements:
795
799
  - - ">="