kreuzberg 4.2.0 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/lib/kreuzberg/cli.rb +16 -6
  6. data/lib/kreuzberg/cli_proxy.rb +3 -1
  7. data/lib/kreuzberg/config.rb +56 -9
  8. data/lib/kreuzberg/djot_content.rb +225 -0
  9. data/lib/kreuzberg/extraction_api.rb +20 -4
  10. data/lib/kreuzberg/result.rb +12 -2
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/lib/kreuzberg.rb +1 -0
  13. data/sig/kreuzberg.rbs +23 -11
  14. data/spec/binding/batch_spec.rb +6 -5
  15. data/spec/binding/error_recovery_spec.rb +3 -3
  16. data/spec/binding/tables_spec.rb +11 -2
  17. data/spec/unit/config/output_format_spec.rb +18 -18
  18. data/vendor/Cargo.toml +1 -1
  19. data/vendor/kreuzberg/Cargo.toml +1 -1
  20. data/vendor/kreuzberg/README.md +1 -1
  21. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  22. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  23. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  24. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  25. data/vendor/kreuzberg/src/core/io.rs +7 -7
  26. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  27. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  28. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  29. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  30. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  31. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  32. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  33. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  34. data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
  35. data/vendor/kreuzberg/tests/core_integration.rs +2 -4
  36. data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
  37. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
  38. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  39. metadata +4 -2
@@ -11,7 +11,7 @@ module Kreuzberg
11
11
  # rubocop:disable Metrics/ClassLength
12
12
  class Result
13
13
  attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
14
- :detected_languages, :chunks, :images, :pages, :elements
14
+ :detected_languages, :chunks, :images, :pages, :elements, :djot_content
15
15
 
16
16
  # @!attribute [r] cells
17
17
  # @return [Array<Array<String>>] Table cells (2D array)
@@ -180,6 +180,7 @@ module Kreuzberg
180
180
  #
181
181
  # @param hash [Hash] Hash returned from native extension
182
182
  #
183
+ # rubocop:disable Metrics/AbcSize
183
184
  def initialize(hash)
184
185
  @content = get_value(hash, 'content', '')
185
186
  @mime_type = get_value(hash, 'mime_type', '')
@@ -191,7 +192,9 @@ module Kreuzberg
191
192
  @images = parse_images(get_value(hash, 'images'))
192
193
  @pages = parse_pages(get_value(hash, 'pages'))
193
194
  @elements = parse_elements(get_value(hash, 'elements'))
195
+ @djot_content = parse_djot_content(get_value(hash, 'djot_content'))
194
196
  end
197
+ # rubocop:enable Metrics/AbcSize
195
198
 
196
199
  # Convert to hash
197
200
  #
@@ -207,7 +210,8 @@ module Kreuzberg
207
210
  chunks: serialize_chunks,
208
211
  images: serialize_images,
209
212
  pages: serialize_pages,
210
- elements: serialize_elements
213
+ elements: serialize_elements,
214
+ djot_content: @djot_content&.to_h
211
215
  }
212
216
  end
213
217
 
@@ -434,6 +438,12 @@ module Kreuzberg
434
438
  y1: coordinates_data['y1'].to_f
435
439
  )
436
440
  end
441
+
442
+ def parse_djot_content(djot_data)
443
+ return nil if djot_data.nil?
444
+
445
+ DjotContent.new(djot_data)
446
+ end
437
447
  end
438
448
  # rubocop:enable Metrics/ClassLength
439
449
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.0'
4
+ VERSION = '4.2.1'
5
5
  end
data/lib/kreuzberg.rb CHANGED
@@ -87,6 +87,7 @@ end
87
87
 
88
88
  require_relative 'kreuzberg/cache_api'
89
89
  require_relative 'kreuzberg/extraction_api'
90
+ require_relative 'kreuzberg/djot_content'
90
91
 
91
92
  Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
92
93
  Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)
data/sig/kreuzberg.rbs CHANGED
@@ -417,14 +417,23 @@ module Kreuzberg
417
417
  attr_reader plain_text: String
418
418
  attr_reader blocks: Array[DjotContent::FormattedBlock]
419
419
  attr_reader metadata: Hash[untyped, untyped]
420
- attr_reader tables: Array[Table]
420
+ attr_reader metadata_json: String
421
+ attr_reader tables: Array[untyped]
421
422
  attr_reader images: Array[DjotContent::DjotImage]
422
423
  attr_reader links: Array[DjotContent::DjotLink]
423
424
  attr_reader footnotes: Array[DjotContent::Footnote]
424
425
  attr_reader attributes: Hash[String, untyped]?
425
426
 
426
- def initialize: (djot_content_hash hash) -> void
427
- def to_h: () -> djot_content_hash
427
+ def initialize: (untyped hash) -> void
428
+ def to_h: () -> Hash[Symbol, untyped]
429
+
430
+ private
431
+
432
+ def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
433
+ def parse_blocks: (Array[untyped] blocks_data) -> Array[FormattedBlock]
434
+ def parse_images: (Array[untyped] images_data) -> Array[DjotImage]
435
+ def parse_links: (Array[untyped] links_data) -> Array[DjotLink]
436
+ def parse_footnotes: (Array[untyped] footnotes_data) -> Array[Footnote]
428
437
 
429
438
  class FormattedBlock
430
439
  attr_reader block_type: String
@@ -433,28 +442,31 @@ module Kreuzberg
433
442
  attr_reader children: Array[FormattedBlock]?
434
443
  attr_reader attributes: Hash[String, untyped]?
435
444
 
436
- def initialize: (formatted_block_hash hash) -> void
437
- def to_h: () -> formatted_block_hash
445
+ def initialize: (?untyped hash_or_type, ?children: untyped, ?attributes: untyped, ?content: untyped, ?level: untyped, ?block_type: untyped) -> void
446
+ def to_h: () -> Hash[Symbol, untyped]
438
447
  end
439
448
 
440
449
  class DjotImage
441
450
  attr_reader url: String
442
451
  attr_reader alt: String?
443
452
  attr_reader title: String?
444
- attr_reader attributes: Hash[String, untyped]?
453
+ attr_reader width: Integer?
454
+ attr_reader height: Integer?
445
455
 
446
- def initialize: (djot_image_hash hash) -> void
447
- def to_h: () -> djot_image_hash
456
+ def initialize: (?untyped hash_or_url, ?alt: untyped, ?title: untyped, ?width: untyped, ?height: untyped, ?url: untyped, ?src: untyped) -> void
457
+ def src: () -> String
458
+ def to_h: () -> Hash[Symbol, untyped]
448
459
  end
449
460
 
450
461
  class DjotLink
451
462
  attr_reader url: String
452
- attr_reader text: String
463
+ attr_reader text: String?
453
464
  attr_reader title: String?
454
465
  attr_reader link_type: String?
455
466
 
456
- def initialize: (djot_link_hash hash) -> void
457
- def to_h: () -> djot_link_hash
467
+ def initialize: (?untyped hash_or_url, ?text: untyped, ?title: untyped, ?url: untyped, ?href: untyped, ?link_type: untyped) -> void
468
+ def href: () -> String
469
+ def to_h: () -> Hash[Symbol, untyped]
458
470
  end
459
471
 
460
472
  class Footnote
@@ -295,7 +295,7 @@ RSpec.describe Kreuzberg do
295
295
  end
296
296
 
297
297
  describe 'batch error handling' do
298
- it 'handles missing files gracefully in batch' do
298
+ it 'raises IOError for missing files in batch' do
299
299
  paths = [
300
300
  '/nonexistent/file1.txt',
301
301
  '/nonexistent/file2.txt'
@@ -303,10 +303,10 @@ RSpec.describe Kreuzberg do
303
303
 
304
304
  expect do
305
305
  described_class.batch_extract_files_sync(paths: paths)
306
- end.not_to raise_error
306
+ end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
307
307
  end
308
308
 
309
- it 'handles mixed valid and invalid paths' do
309
+ it 'raises IOError when batch contains invalid paths' do
310
310
  paths = []
311
311
  temp_dir = Dir.mktmpdir
312
312
 
@@ -316,8 +316,9 @@ RSpec.describe Kreuzberg do
316
316
 
317
317
  paths << '/nonexistent/invalid.txt'
318
318
 
319
- results = described_class.batch_extract_files_sync(paths: paths)
320
- expect(results).to be_a(Array)
319
+ expect do
320
+ described_class.batch_extract_files_sync(paths: paths)
321
+ end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
321
322
  ensure
322
323
  FileUtils.remove_entry(temp_dir)
323
324
  end
@@ -57,7 +57,7 @@ RSpec.describe 'Error Recovery' do
57
57
  nonexistent_path = '/nonexistent/file/that/does/not/exist.pdf'
58
58
 
59
59
  expect { Kreuzberg.extract_file_sync(path: nonexistent_path, config: config) }
60
- .to raise_error(Kreuzberg::Errors::ValidationError, /not found|does not exist|no such file/)
60
+ .to raise_error(Kreuzberg::Errors::IOError, /not found|does not exist|no such file/)
61
61
  end
62
62
 
63
63
  it 'provides descriptive error messages for invalid MIME types' do
@@ -293,7 +293,7 @@ RSpec.describe 'Error Recovery' do
293
293
 
294
294
  expect(validation_error).to be_a(ArgumentError)
295
295
 
296
- # Runtime error (file not found)
296
+ # Runtime error (file not found) - IOError since the file doesn't exist
297
297
  runtime_error = nil
298
298
  begin
299
299
  Kreuzberg.extract_file_sync(path: '/nonexistent/file.pdf')
@@ -301,7 +301,7 @@ RSpec.describe 'Error Recovery' do
301
301
  runtime_error = e
302
302
  end
303
303
 
304
- expect(runtime_error).to be_a(Kreuzberg::Errors::ValidationError)
304
+ expect(runtime_error).to be_a(Kreuzberg::Errors::IOError)
305
305
  end
306
306
 
307
307
  it 'provides error recovery suggestions in messages' do
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'spec_helper'
4
+ require 'tempfile'
5
+ require 'fileutils'
4
6
 
5
7
  RSpec.describe 'Table Extraction Quality' do
6
8
  describe 'table structure extraction' do
@@ -523,12 +525,19 @@ RSpec.describe 'Table Extraction Quality' do
523
525
  it 'handles documents with no tables gracefully' do
524
526
  config = Kreuzberg::Config::Extraction.new
525
527
 
528
+ # Create a temporary text file for this test
529
+ file = Tempfile.new(['no_tables_test', '.txt'])
530
+ file.write('This is a text document without any tables.')
531
+ file.close
532
+
526
533
  begin
527
- result = Kreuzberg.extract_file(path: 'test.txt', config: config)
534
+ result = Kreuzberg.extract_file(path: file.path, config: config)
528
535
  expect(result).not_to be_nil
529
536
  expect(result.tables).to be_a(Array) if result.tables
530
- rescue Kreuzberg::Errors::ValidationError
537
+ rescue Kreuzberg::Errors::IOError
531
538
  skip 'Text file not available for testing'
539
+ ensure
540
+ FileUtils.rm_f(file.path)
532
541
  end
533
542
  end
534
543
 
@@ -282,34 +282,34 @@ RSpec.describe 'Output Format and Result Format Configuration' do
282
282
  end
283
283
 
284
284
  describe 'format validation and edge cases' do
285
- it 'handles empty string output_format' do
286
- config = described_class.new(output_format: '')
287
-
288
- expect(config.output_format).to eq ''
285
+ it 'raises error for empty string output_format' do
286
+ expect do
287
+ described_class.new(output_format: '')
288
+ end.to raise_error(ArgumentError, /Invalid output_format/)
289
289
  end
290
290
 
291
- it 'handles empty string result_format' do
292
- config = described_class.new(result_format: '')
293
-
294
- expect(config.result_format).to eq ''
291
+ it 'raises error for empty string result_format' do
292
+ expect do
293
+ described_class.new(result_format: '')
294
+ end.to raise_error(ArgumentError, /Invalid result_format/)
295
295
  end
296
296
 
297
- it 'handles whitespace in output_format' do
298
- config = described_class.new(output_format: ' plain ')
299
-
300
- expect(config.output_format).to eq ' plain '
297
+ it 'raises error for whitespace in output_format' do
298
+ expect do
299
+ described_class.new(output_format: ' plain ')
300
+ end.to raise_error(ArgumentError, /Invalid output_format/)
301
301
  end
302
302
 
303
- it 'handles case sensitivity in output_format' do
303
+ it 'normalizes case in output_format' do
304
304
  config = described_class.new(output_format: 'MarkDown')
305
305
 
306
- expect(config.output_format).to eq 'MarkDown'
306
+ expect(config.output_format).to eq 'markdown'
307
307
  end
308
308
 
309
- it 'handles custom string in result_format' do
310
- config = described_class.new(result_format: 'custom_format')
311
-
312
- expect(config.result_format).to eq 'custom_format'
309
+ it 'raises error for custom string in result_format' do
310
+ expect do
311
+ described_class.new(result_format: 'custom_format')
312
+ end.to raise_error(ArgumentError, /Invalid result_format/)
313
313
  end
314
314
  end
315
315
 
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.0"
6
+ version = "4.2.1"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.0"
3
+ version = "4.2.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.0 Release**
20
+ > **🚀 Version 4.2.1 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -2,7 +2,7 @@
2
2
 
3
3
  use std::net::{IpAddr, SocketAddr};
4
4
 
5
- use crate::{ExtractionConfig, Result, core::ServerConfig};
5
+ use crate::{ExtractionConfig, Result, core::ServerConfig, plugins::startup_validation::validate_plugins_at_startup};
6
6
 
7
7
  use super::{config::load_server_config, router::create_router_with_limits_and_server_config, types::ApiSizeLimits};
8
8
 
@@ -80,6 +80,9 @@ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
80
80
  server_config.max_multipart_field_bytes,
81
81
  );
82
82
 
83
+ // Validate plugins at startup
84
+ validate_plugins_at_startup()?;
85
+
83
86
  serve_with_config_and_limits(host, port, extraction_config, limits).await
84
87
  }
85
88
 
@@ -111,6 +114,10 @@ pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: Extract
111
114
  "Upload size limit: 100 MB (default, {} bytes)",
112
115
  limits.max_request_body_bytes
113
116
  );
117
+
118
+ // Validate plugins at startup
119
+ validate_plugins_at_startup()?;
120
+
114
121
  serve_with_config_and_limits(host, port, config, limits).await
115
122
  }
116
123
 
@@ -158,6 +165,9 @@ pub async fn serve_with_config_and_limits(
158
165
  let addr = SocketAddr::new(ip, port);
159
166
  let app = create_router_with_limits_and_server_config(config, limits, server_config);
160
167
 
168
+ // Validate plugins at startup
169
+ validate_plugins_at_startup()?;
170
+
161
171
  tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
162
172
 
163
173
  let listener = tokio::net::TcpListener::bind(addr)
@@ -214,6 +224,9 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
214
224
  let addr = SocketAddr::new(ip, server_config.port);
215
225
  let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
216
226
 
227
+ // Validate plugins at startup
228
+ validate_plugins_at_startup()?;
229
+
217
230
  tracing::info!(
218
231
  "Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
219
232
  ip,
@@ -238,6 +251,7 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
238
251
  /// Defaults: host = "127.0.0.1", port = 8000
239
252
  ///
240
253
  /// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
254
+ /// Validates plugins at startup to help diagnose configuration issues.
241
255
  pub async fn serve_default() -> Result<()> {
242
256
  serve("127.0.0.1", 8000).await
243
257
  }
@@ -30,8 +30,10 @@ const VALID_TESSERACT_PSM: &[i32] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
30
30
  /// Valid tesseract OEM (OCR Engine Mode) values.
31
31
  const VALID_TESSERACT_OEM: &[i32] = &[0, 1, 2, 3];
32
32
 
33
- /// Valid output formats for tesseract.
34
- const VALID_OUTPUT_FORMATS: &[&str] = &["text", "markdown"];
33
+ /// Valid output formats for document extraction.
34
+ /// Supports plain text, markdown, djot, and HTML output formats.
35
+ /// Also accepts aliases: "text" for "plain", "md" for "markdown".
36
+ const VALID_OUTPUT_FORMATS: &[&str] = &["plain", "text", "markdown", "md", "djot", "html"];
35
37
 
36
38
  /// Validate a binarization method string.
37
39
  ///
@@ -248,11 +250,17 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
248
250
  }
249
251
  }
250
252
 
251
- /// Validate a tesseract output format.
253
+ /// Validate a document extraction output format.
254
+ ///
255
+ /// Accepts the following formats and aliases:
256
+ /// - "plain" or "text" for plain text output
257
+ /// - "markdown" or "md" for Markdown output
258
+ /// - "djot" for Djot markup format
259
+ /// - "html" for HTML output
252
260
  ///
253
261
  /// # Arguments
254
262
  ///
255
- /// * `format` - The output format to validate (e.g., "text", "markdown")
263
+ /// * `format` - The output format to validate
256
264
  ///
257
265
  /// # Returns
258
266
  ///
@@ -264,7 +272,11 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
264
272
  /// use kreuzberg::core::config_validation::validate_output_format;
265
273
  ///
266
274
  /// assert!(validate_output_format("text").is_ok());
275
+ /// assert!(validate_output_format("plain").is_ok());
267
276
  /// assert!(validate_output_format("markdown").is_ok());
277
+ /// assert!(validate_output_format("md").is_ok());
278
+ /// assert!(validate_output_format("djot").is_ok());
279
+ /// assert!(validate_output_format("html").is_ok());
268
280
  /// assert!(validate_output_format("json").is_err());
269
281
  /// ```
270
282
  pub fn validate_output_format(format: &str) -> Result<()> {
@@ -106,9 +106,8 @@ pub(in crate::core::extractor) fn record_error(error: &KreuzbergError) {
106
106
  ///
107
107
  /// # Errors
108
108
  ///
109
- /// Returns `KreuzbergError::Validation` if the file doesn't exist or path is invalid.
109
+ /// Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
110
110
  /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
111
- /// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
112
111
  ///
113
112
  /// # Example
114
113
  ///
@@ -411,7 +411,8 @@ mod tests {
411
411
 
412
412
  assert!(result.is_err());
413
413
  use crate::KreuzbergError;
414
- assert!(matches!(result.unwrap_err(), KreuzbergError::Validation { .. }));
414
+ // File validation returns Io error, not Validation error
415
+ assert!(matches!(result.unwrap_err(), KreuzbergError::Io { .. }));
415
416
  }
416
417
 
417
418
  #[test]
@@ -61,12 +61,12 @@ pub fn file_exists(path: impl AsRef<Path>) -> bool {
61
61
  ///
62
62
  /// # Errors
63
63
  ///
64
- /// Returns `KreuzbergError::Validation` if file doesn't exist.
64
+ /// Returns `KreuzbergError::Io` if file doesn't exist.
65
65
  pub fn validate_file_exists(path: impl AsRef<Path>) -> Result<()> {
66
66
  if !file_exists(&path) {
67
- return Err(KreuzbergError::validation(format!(
68
- "File does not exist: {}",
69
- path.as_ref().display()
67
+ return Err(KreuzbergError::from(std::io::Error::new(
68
+ std::io::ErrorKind::NotFound,
69
+ format!("File does not exist: {}", path.as_ref().display()),
70
70
  )));
71
71
  }
72
72
  Ok(())
@@ -99,9 +99,9 @@ where
99
99
  let mut files = Vec::new();
100
100
 
101
101
  if !dir.is_dir() {
102
- return Err(KreuzbergError::validation(format!(
103
- "Path is not a directory: {}",
104
- dir.display()
102
+ return Err(KreuzbergError::from(std::io::Error::new(
103
+ std::io::ErrorKind::NotADirectory,
104
+ format!("Path is not a directory: {}", dir.display()),
105
105
  )));
106
106
  }
107
107
 
@@ -231,15 +231,15 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
231
231
  ///
232
232
  /// # Errors
233
233
  ///
234
- /// Returns `KreuzbergError::Validation` if file doesn't exist (when `check_exists` is true).
234
+ /// Returns `KreuzbergError::Io` if file doesn't exist (when `check_exists` is true).
235
235
  /// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
236
236
  pub fn detect_mime_type(path: impl AsRef<Path>, check_exists: bool) -> Result<String> {
237
237
  let path = path.as_ref();
238
238
 
239
239
  if check_exists && !path.exists() {
240
- return Err(KreuzbergError::validation(format!(
241
- "File does not exist: {}",
242
- path.display()
240
+ return Err(KreuzbergError::from(std::io::Error::new(
241
+ std::io::ErrorKind::NotFound,
242
+ format!("File does not exist: {}", path.display()),
243
243
  )));
244
244
  }
245
245
 
@@ -384,5 +384,11 @@ pub(super) fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
384
384
  }
385
385
  }
386
386
 
387
+ // Sort slide paths to ensure correct ordering regardless of XML order.
388
+ // PowerPoint doesn't guarantee relationship order in the rels file.
389
+ // GitHub Issue #329: Without sorting, slides can be processed in wrong order,
390
+ // causing images to have incorrect page numbers.
391
+ slide_paths.sort();
392
+
387
393
  Ok(slide_paths)
388
394
  }
@@ -206,6 +206,7 @@ mod extractor;
206
206
  mod ocr;
207
207
  mod processor;
208
208
  pub mod registry;
209
+ pub mod startup_validation;
209
210
  mod traits;
210
211
  mod validator;
211
212