kreuzberg 4.2.0 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +56 -9
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +23 -11
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/unit/config/output_format_spec.rb +18 -18
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
- data/vendor/kreuzberg/tests/core_integration.rs +2 -4
- data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +4 -2
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -11,7 +11,7 @@ module Kreuzberg
|
|
|
11
11
|
# rubocop:disable Metrics/ClassLength
|
|
12
12
|
class Result
|
|
13
13
|
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
14
|
-
:detected_languages, :chunks, :images, :pages, :elements
|
|
14
|
+
:detected_languages, :chunks, :images, :pages, :elements, :djot_content
|
|
15
15
|
|
|
16
16
|
# @!attribute [r] cells
|
|
17
17
|
# @return [Array<Array<String>>] Table cells (2D array)
|
|
@@ -180,6 +180,7 @@ module Kreuzberg
|
|
|
180
180
|
#
|
|
181
181
|
# @param hash [Hash] Hash returned from native extension
|
|
182
182
|
#
|
|
183
|
+
# rubocop:disable Metrics/AbcSize
|
|
183
184
|
def initialize(hash)
|
|
184
185
|
@content = get_value(hash, 'content', '')
|
|
185
186
|
@mime_type = get_value(hash, 'mime_type', '')
|
|
@@ -191,7 +192,9 @@ module Kreuzberg
|
|
|
191
192
|
@images = parse_images(get_value(hash, 'images'))
|
|
192
193
|
@pages = parse_pages(get_value(hash, 'pages'))
|
|
193
194
|
@elements = parse_elements(get_value(hash, 'elements'))
|
|
195
|
+
@djot_content = parse_djot_content(get_value(hash, 'djot_content'))
|
|
194
196
|
end
|
|
197
|
+
# rubocop:enable Metrics/AbcSize
|
|
195
198
|
|
|
196
199
|
# Convert to hash
|
|
197
200
|
#
|
|
@@ -207,7 +210,8 @@ module Kreuzberg
|
|
|
207
210
|
chunks: serialize_chunks,
|
|
208
211
|
images: serialize_images,
|
|
209
212
|
pages: serialize_pages,
|
|
210
|
-
elements: serialize_elements
|
|
213
|
+
elements: serialize_elements,
|
|
214
|
+
djot_content: @djot_content&.to_h
|
|
211
215
|
}
|
|
212
216
|
end
|
|
213
217
|
|
|
@@ -434,6 +438,12 @@ module Kreuzberg
|
|
|
434
438
|
y1: coordinates_data['y1'].to_f
|
|
435
439
|
)
|
|
436
440
|
end
|
|
441
|
+
|
|
442
|
+
def parse_djot_content(djot_data)
|
|
443
|
+
return nil if djot_data.nil?
|
|
444
|
+
|
|
445
|
+
DjotContent.new(djot_data)
|
|
446
|
+
end
|
|
437
447
|
end
|
|
438
448
|
# rubocop:enable Metrics/ClassLength
|
|
439
449
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg.rb
CHANGED
|
@@ -87,6 +87,7 @@ end
|
|
|
87
87
|
|
|
88
88
|
require_relative 'kreuzberg/cache_api'
|
|
89
89
|
require_relative 'kreuzberg/extraction_api'
|
|
90
|
+
require_relative 'kreuzberg/djot_content'
|
|
90
91
|
|
|
91
92
|
Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
|
|
92
93
|
Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -417,14 +417,23 @@ module Kreuzberg
|
|
|
417
417
|
attr_reader plain_text: String
|
|
418
418
|
attr_reader blocks: Array[DjotContent::FormattedBlock]
|
|
419
419
|
attr_reader metadata: Hash[untyped, untyped]
|
|
420
|
-
attr_reader
|
|
420
|
+
attr_reader metadata_json: String
|
|
421
|
+
attr_reader tables: Array[untyped]
|
|
421
422
|
attr_reader images: Array[DjotContent::DjotImage]
|
|
422
423
|
attr_reader links: Array[DjotContent::DjotLink]
|
|
423
424
|
attr_reader footnotes: Array[DjotContent::Footnote]
|
|
424
425
|
attr_reader attributes: Hash[String, untyped]?
|
|
425
426
|
|
|
426
|
-
def initialize: (
|
|
427
|
-
def to_h: () ->
|
|
427
|
+
def initialize: (untyped hash) -> void
|
|
428
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
429
|
+
|
|
430
|
+
private
|
|
431
|
+
|
|
432
|
+
def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
|
|
433
|
+
def parse_blocks: (Array[untyped] blocks_data) -> Array[FormattedBlock]
|
|
434
|
+
def parse_images: (Array[untyped] images_data) -> Array[DjotImage]
|
|
435
|
+
def parse_links: (Array[untyped] links_data) -> Array[DjotLink]
|
|
436
|
+
def parse_footnotes: (Array[untyped] footnotes_data) -> Array[Footnote]
|
|
428
437
|
|
|
429
438
|
class FormattedBlock
|
|
430
439
|
attr_reader block_type: String
|
|
@@ -433,28 +442,31 @@ module Kreuzberg
|
|
|
433
442
|
attr_reader children: Array[FormattedBlock]?
|
|
434
443
|
attr_reader attributes: Hash[String, untyped]?
|
|
435
444
|
|
|
436
|
-
def initialize: (
|
|
437
|
-
def to_h: () ->
|
|
445
|
+
def initialize: (?untyped hash_or_type, ?children: untyped, ?attributes: untyped, ?content: untyped, ?level: untyped, ?block_type: untyped) -> void
|
|
446
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
438
447
|
end
|
|
439
448
|
|
|
440
449
|
class DjotImage
|
|
441
450
|
attr_reader url: String
|
|
442
451
|
attr_reader alt: String?
|
|
443
452
|
attr_reader title: String?
|
|
444
|
-
attr_reader
|
|
453
|
+
attr_reader width: Integer?
|
|
454
|
+
attr_reader height: Integer?
|
|
445
455
|
|
|
446
|
-
def initialize: (
|
|
447
|
-
def
|
|
456
|
+
def initialize: (?untyped hash_or_url, ?alt: untyped, ?title: untyped, ?width: untyped, ?height: untyped, ?url: untyped, ?src: untyped) -> void
|
|
457
|
+
def src: () -> String
|
|
458
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
448
459
|
end
|
|
449
460
|
|
|
450
461
|
class DjotLink
|
|
451
462
|
attr_reader url: String
|
|
452
|
-
attr_reader text: String
|
|
463
|
+
attr_reader text: String?
|
|
453
464
|
attr_reader title: String?
|
|
454
465
|
attr_reader link_type: String?
|
|
455
466
|
|
|
456
|
-
def initialize: (
|
|
457
|
-
def
|
|
467
|
+
def initialize: (?untyped hash_or_url, ?text: untyped, ?title: untyped, ?url: untyped, ?href: untyped, ?link_type: untyped) -> void
|
|
468
|
+
def href: () -> String
|
|
469
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
458
470
|
end
|
|
459
471
|
|
|
460
472
|
class Footnote
|
data/spec/binding/batch_spec.rb
CHANGED
|
@@ -295,7 +295,7 @@ RSpec.describe Kreuzberg do
|
|
|
295
295
|
end
|
|
296
296
|
|
|
297
297
|
describe 'batch error handling' do
|
|
298
|
-
it '
|
|
298
|
+
it 'raises IOError for missing files in batch' do
|
|
299
299
|
paths = [
|
|
300
300
|
'/nonexistent/file1.txt',
|
|
301
301
|
'/nonexistent/file2.txt'
|
|
@@ -303,10 +303,10 @@ RSpec.describe Kreuzberg do
|
|
|
303
303
|
|
|
304
304
|
expect do
|
|
305
305
|
described_class.batch_extract_files_sync(paths: paths)
|
|
306
|
-
end.
|
|
306
|
+
end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
|
|
307
307
|
end
|
|
308
308
|
|
|
309
|
-
it '
|
|
309
|
+
it 'raises IOError when batch contains invalid paths' do
|
|
310
310
|
paths = []
|
|
311
311
|
temp_dir = Dir.mktmpdir
|
|
312
312
|
|
|
@@ -316,8 +316,9 @@ RSpec.describe Kreuzberg do
|
|
|
316
316
|
|
|
317
317
|
paths << '/nonexistent/invalid.txt'
|
|
318
318
|
|
|
319
|
-
|
|
320
|
-
|
|
319
|
+
expect do
|
|
320
|
+
described_class.batch_extract_files_sync(paths: paths)
|
|
321
|
+
end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
|
|
321
322
|
ensure
|
|
322
323
|
FileUtils.remove_entry(temp_dir)
|
|
323
324
|
end
|
|
@@ -57,7 +57,7 @@ RSpec.describe 'Error Recovery' do
|
|
|
57
57
|
nonexistent_path = '/nonexistent/file/that/does/not/exist.pdf'
|
|
58
58
|
|
|
59
59
|
expect { Kreuzberg.extract_file_sync(path: nonexistent_path, config: config) }
|
|
60
|
-
.to raise_error(Kreuzberg::Errors::
|
|
60
|
+
.to raise_error(Kreuzberg::Errors::IOError, /not found|does not exist|no such file/)
|
|
61
61
|
end
|
|
62
62
|
|
|
63
63
|
it 'provides descriptive error messages for invalid MIME types' do
|
|
@@ -293,7 +293,7 @@ RSpec.describe 'Error Recovery' do
|
|
|
293
293
|
|
|
294
294
|
expect(validation_error).to be_a(ArgumentError)
|
|
295
295
|
|
|
296
|
-
# Runtime error (file not found)
|
|
296
|
+
# Runtime error (file not found) - IOError since the file doesn't exist
|
|
297
297
|
runtime_error = nil
|
|
298
298
|
begin
|
|
299
299
|
Kreuzberg.extract_file_sync(path: '/nonexistent/file.pdf')
|
|
@@ -301,7 +301,7 @@ RSpec.describe 'Error Recovery' do
|
|
|
301
301
|
runtime_error = e
|
|
302
302
|
end
|
|
303
303
|
|
|
304
|
-
expect(runtime_error).to be_a(Kreuzberg::Errors::
|
|
304
|
+
expect(runtime_error).to be_a(Kreuzberg::Errors::IOError)
|
|
305
305
|
end
|
|
306
306
|
|
|
307
307
|
it 'provides error recovery suggestions in messages' do
|
data/spec/binding/tables_spec.rb
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'spec_helper'
|
|
4
|
+
require 'tempfile'
|
|
5
|
+
require 'fileutils'
|
|
4
6
|
|
|
5
7
|
RSpec.describe 'Table Extraction Quality' do
|
|
6
8
|
describe 'table structure extraction' do
|
|
@@ -523,12 +525,19 @@ RSpec.describe 'Table Extraction Quality' do
|
|
|
523
525
|
it 'handles documents with no tables gracefully' do
|
|
524
526
|
config = Kreuzberg::Config::Extraction.new
|
|
525
527
|
|
|
528
|
+
# Create a temporary text file for this test
|
|
529
|
+
file = Tempfile.new(['no_tables_test', '.txt'])
|
|
530
|
+
file.write('This is a text document without any tables.')
|
|
531
|
+
file.close
|
|
532
|
+
|
|
526
533
|
begin
|
|
527
|
-
result = Kreuzberg.extract_file(path:
|
|
534
|
+
result = Kreuzberg.extract_file(path: file.path, config: config)
|
|
528
535
|
expect(result).not_to be_nil
|
|
529
536
|
expect(result.tables).to be_a(Array) if result.tables
|
|
530
|
-
rescue Kreuzberg::Errors::
|
|
537
|
+
rescue Kreuzberg::Errors::IOError
|
|
531
538
|
skip 'Text file not available for testing'
|
|
539
|
+
ensure
|
|
540
|
+
FileUtils.rm_f(file.path)
|
|
532
541
|
end
|
|
533
542
|
end
|
|
534
543
|
|
|
@@ -282,34 +282,34 @@ RSpec.describe 'Output Format and Result Format Configuration' do
|
|
|
282
282
|
end
|
|
283
283
|
|
|
284
284
|
describe 'format validation and edge cases' do
|
|
285
|
-
it '
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
285
|
+
it 'raises error for empty string output_format' do
|
|
286
|
+
expect do
|
|
287
|
+
described_class.new(output_format: '')
|
|
288
|
+
end.to raise_error(ArgumentError, /Invalid output_format/)
|
|
289
289
|
end
|
|
290
290
|
|
|
291
|
-
it '
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
291
|
+
it 'raises error for empty string result_format' do
|
|
292
|
+
expect do
|
|
293
|
+
described_class.new(result_format: '')
|
|
294
|
+
end.to raise_error(ArgumentError, /Invalid result_format/)
|
|
295
295
|
end
|
|
296
296
|
|
|
297
|
-
it '
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
297
|
+
it 'raises error for whitespace in output_format' do
|
|
298
|
+
expect do
|
|
299
|
+
described_class.new(output_format: ' plain ')
|
|
300
|
+
end.to raise_error(ArgumentError, /Invalid output_format/)
|
|
301
301
|
end
|
|
302
302
|
|
|
303
|
-
it '
|
|
303
|
+
it 'normalizes case in output_format' do
|
|
304
304
|
config = described_class.new(output_format: 'MarkDown')
|
|
305
305
|
|
|
306
|
-
expect(config.output_format).to eq '
|
|
306
|
+
expect(config.output_format).to eq 'markdown'
|
|
307
307
|
end
|
|
308
308
|
|
|
309
|
-
it '
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
309
|
+
it 'raises error for custom string in result_format' do
|
|
310
|
+
expect do
|
|
311
|
+
described_class.new(result_format: 'custom_format')
|
|
312
|
+
end.to raise_error(ArgumentError, /Invalid result_format/)
|
|
313
313
|
end
|
|
314
314
|
end
|
|
315
315
|
|
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.
|
|
20
|
+
> **🚀 Version 4.2.1 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
use std::net::{IpAddr, SocketAddr};
|
|
4
4
|
|
|
5
|
-
use crate::{ExtractionConfig, Result, core::ServerConfig};
|
|
5
|
+
use crate::{ExtractionConfig, Result, core::ServerConfig, plugins::startup_validation::validate_plugins_at_startup};
|
|
6
6
|
|
|
7
7
|
use super::{config::load_server_config, router::create_router_with_limits_and_server_config, types::ApiSizeLimits};
|
|
8
8
|
|
|
@@ -80,6 +80,9 @@ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
|
|
|
80
80
|
server_config.max_multipart_field_bytes,
|
|
81
81
|
);
|
|
82
82
|
|
|
83
|
+
// Validate plugins at startup
|
|
84
|
+
validate_plugins_at_startup()?;
|
|
85
|
+
|
|
83
86
|
serve_with_config_and_limits(host, port, extraction_config, limits).await
|
|
84
87
|
}
|
|
85
88
|
|
|
@@ -111,6 +114,10 @@ pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: Extract
|
|
|
111
114
|
"Upload size limit: 100 MB (default, {} bytes)",
|
|
112
115
|
limits.max_request_body_bytes
|
|
113
116
|
);
|
|
117
|
+
|
|
118
|
+
// Validate plugins at startup
|
|
119
|
+
validate_plugins_at_startup()?;
|
|
120
|
+
|
|
114
121
|
serve_with_config_and_limits(host, port, config, limits).await
|
|
115
122
|
}
|
|
116
123
|
|
|
@@ -158,6 +165,9 @@ pub async fn serve_with_config_and_limits(
|
|
|
158
165
|
let addr = SocketAddr::new(ip, port);
|
|
159
166
|
let app = create_router_with_limits_and_server_config(config, limits, server_config);
|
|
160
167
|
|
|
168
|
+
// Validate plugins at startup
|
|
169
|
+
validate_plugins_at_startup()?;
|
|
170
|
+
|
|
161
171
|
tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
|
|
162
172
|
|
|
163
173
|
let listener = tokio::net::TcpListener::bind(addr)
|
|
@@ -214,6 +224,9 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
|
|
|
214
224
|
let addr = SocketAddr::new(ip, server_config.port);
|
|
215
225
|
let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
|
|
216
226
|
|
|
227
|
+
// Validate plugins at startup
|
|
228
|
+
validate_plugins_at_startup()?;
|
|
229
|
+
|
|
217
230
|
tracing::info!(
|
|
218
231
|
"Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
|
|
219
232
|
ip,
|
|
@@ -238,6 +251,7 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
|
|
|
238
251
|
/// Defaults: host = "127.0.0.1", port = 8000
|
|
239
252
|
///
|
|
240
253
|
/// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
|
|
254
|
+
/// Validates plugins at startup to help diagnose configuration issues.
|
|
241
255
|
pub async fn serve_default() -> Result<()> {
|
|
242
256
|
serve("127.0.0.1", 8000).await
|
|
243
257
|
}
|
|
@@ -30,8 +30,10 @@ const VALID_TESSERACT_PSM: &[i32] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
|
|
|
30
30
|
/// Valid tesseract OEM (OCR Engine Mode) values.
|
|
31
31
|
const VALID_TESSERACT_OEM: &[i32] = &[0, 1, 2, 3];
|
|
32
32
|
|
|
33
|
-
/// Valid output formats for
|
|
34
|
-
|
|
33
|
+
/// Valid output formats for document extraction.
|
|
34
|
+
/// Supports plain text, markdown, djot, and HTML output formats.
|
|
35
|
+
/// Also accepts aliases: "text" for "plain", "md" for "markdown".
|
|
36
|
+
const VALID_OUTPUT_FORMATS: &[&str] = &["plain", "text", "markdown", "md", "djot", "html"];
|
|
35
37
|
|
|
36
38
|
/// Validate a binarization method string.
|
|
37
39
|
///
|
|
@@ -248,11 +250,17 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
|
|
|
248
250
|
}
|
|
249
251
|
}
|
|
250
252
|
|
|
251
|
-
/// Validate a
|
|
253
|
+
/// Validate a document extraction output format.
|
|
254
|
+
///
|
|
255
|
+
/// Accepts the following formats and aliases:
|
|
256
|
+
/// - "plain" or "text" for plain text output
|
|
257
|
+
/// - "markdown" or "md" for Markdown output
|
|
258
|
+
/// - "djot" for Djot markup format
|
|
259
|
+
/// - "html" for HTML output
|
|
252
260
|
///
|
|
253
261
|
/// # Arguments
|
|
254
262
|
///
|
|
255
|
-
/// * `format` - The output format to validate
|
|
263
|
+
/// * `format` - The output format to validate
|
|
256
264
|
///
|
|
257
265
|
/// # Returns
|
|
258
266
|
///
|
|
@@ -264,7 +272,11 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
|
|
|
264
272
|
/// use kreuzberg::core::config_validation::validate_output_format;
|
|
265
273
|
///
|
|
266
274
|
/// assert!(validate_output_format("text").is_ok());
|
|
275
|
+
/// assert!(validate_output_format("plain").is_ok());
|
|
267
276
|
/// assert!(validate_output_format("markdown").is_ok());
|
|
277
|
+
/// assert!(validate_output_format("md").is_ok());
|
|
278
|
+
/// assert!(validate_output_format("djot").is_ok());
|
|
279
|
+
/// assert!(validate_output_format("html").is_ok());
|
|
268
280
|
/// assert!(validate_output_format("json").is_err());
|
|
269
281
|
/// ```
|
|
270
282
|
pub fn validate_output_format(format: &str) -> Result<()> {
|
|
@@ -106,9 +106,8 @@ pub(in crate::core::extractor) fn record_error(error: &KreuzbergError) {
|
|
|
106
106
|
///
|
|
107
107
|
/// # Errors
|
|
108
108
|
///
|
|
109
|
-
/// Returns `KreuzbergError::
|
|
109
|
+
/// Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
|
|
110
110
|
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
|
111
|
-
/// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
|
|
112
111
|
///
|
|
113
112
|
/// # Example
|
|
114
113
|
///
|
|
@@ -411,7 +411,8 @@ mod tests {
|
|
|
411
411
|
|
|
412
412
|
assert!(result.is_err());
|
|
413
413
|
use crate::KreuzbergError;
|
|
414
|
-
|
|
414
|
+
// File validation returns Io error, not Validation error
|
|
415
|
+
assert!(matches!(result.unwrap_err(), KreuzbergError::Io { .. }));
|
|
415
416
|
}
|
|
416
417
|
|
|
417
418
|
#[test]
|
|
@@ -61,12 +61,12 @@ pub fn file_exists(path: impl AsRef<Path>) -> bool {
|
|
|
61
61
|
///
|
|
62
62
|
/// # Errors
|
|
63
63
|
///
|
|
64
|
-
/// Returns `KreuzbergError::
|
|
64
|
+
/// Returns `KreuzbergError::Io` if file doesn't exist.
|
|
65
65
|
pub fn validate_file_exists(path: impl AsRef<Path>) -> Result<()> {
|
|
66
66
|
if !file_exists(&path) {
|
|
67
|
-
return Err(KreuzbergError::
|
|
68
|
-
|
|
69
|
-
path.as_ref().display()
|
|
67
|
+
return Err(KreuzbergError::from(std::io::Error::new(
|
|
68
|
+
std::io::ErrorKind::NotFound,
|
|
69
|
+
format!("File does not exist: {}", path.as_ref().display()),
|
|
70
70
|
)));
|
|
71
71
|
}
|
|
72
72
|
Ok(())
|
|
@@ -99,9 +99,9 @@ where
|
|
|
99
99
|
let mut files = Vec::new();
|
|
100
100
|
|
|
101
101
|
if !dir.is_dir() {
|
|
102
|
-
return Err(KreuzbergError::
|
|
103
|
-
|
|
104
|
-
dir.display()
|
|
102
|
+
return Err(KreuzbergError::from(std::io::Error::new(
|
|
103
|
+
std::io::ErrorKind::NotADirectory,
|
|
104
|
+
format!("Path is not a directory: {}", dir.display()),
|
|
105
105
|
)));
|
|
106
106
|
}
|
|
107
107
|
|
|
@@ -231,15 +231,15 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
|
|
231
231
|
///
|
|
232
232
|
/// # Errors
|
|
233
233
|
///
|
|
234
|
-
/// Returns `KreuzbergError::
|
|
234
|
+
/// Returns `KreuzbergError::Io` if file doesn't exist (when `check_exists` is true).
|
|
235
235
|
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
|
|
236
236
|
pub fn detect_mime_type(path: impl AsRef<Path>, check_exists: bool) -> Result<String> {
|
|
237
237
|
let path = path.as_ref();
|
|
238
238
|
|
|
239
239
|
if check_exists && !path.exists() {
|
|
240
|
-
return Err(KreuzbergError::
|
|
241
|
-
|
|
242
|
-
path.display()
|
|
240
|
+
return Err(KreuzbergError::from(std::io::Error::new(
|
|
241
|
+
std::io::ErrorKind::NotFound,
|
|
242
|
+
format!("File does not exist: {}", path.display()),
|
|
243
243
|
)));
|
|
244
244
|
}
|
|
245
245
|
|
|
@@ -384,5 +384,11 @@ pub(super) fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
|
|
|
384
384
|
}
|
|
385
385
|
}
|
|
386
386
|
|
|
387
|
+
// Sort slide paths to ensure correct ordering regardless of XML order.
|
|
388
|
+
// PowerPoint doesn't guarantee relationship order in the rels file.
|
|
389
|
+
// GitHub Issue #329: Without sorting, slides can be processed in wrong order,
|
|
390
|
+
// causing images to have incorrect page numbers.
|
|
391
|
+
slide_paths.sort();
|
|
392
|
+
|
|
387
393
|
Ok(slide_paths)
|
|
388
394
|
}
|