kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -1,99 +1,52 @@
|
|
|
1
|
-
use super::bindings::bind_pdfium;
|
|
2
1
|
use super::error::{PdfError, Result};
|
|
3
|
-
use crate::types::{PageBoundary, PageInfo, PageStructure, PageUnitType};
|
|
4
2
|
use pdfium_render::prelude::*;
|
|
5
3
|
use serde::{Deserialize, Serialize};
|
|
6
4
|
|
|
7
|
-
/// PDF-specific metadata.
|
|
8
|
-
///
|
|
9
|
-
/// Contains metadata fields specific to PDF documents that are not in the common
|
|
10
|
-
/// `Metadata` structure. Common fields like title, authors, keywords, and dates
|
|
11
|
-
/// are now at the `Metadata` level.
|
|
12
5
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
13
6
|
pub struct PdfMetadata {
|
|
14
|
-
/// PDF version (e.g., "1.7", "2.0")
|
|
15
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
16
|
-
pub pdf_version: Option<String>,
|
|
17
|
-
|
|
18
|
-
/// PDF producer (application that created the PDF)
|
|
19
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
20
|
-
pub producer: Option<String>,
|
|
21
|
-
|
|
22
|
-
/// Whether the PDF is encrypted/password-protected
|
|
23
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
24
|
-
pub is_encrypted: Option<bool>,
|
|
25
|
-
|
|
26
|
-
/// First page width in points (1/72 inch)
|
|
27
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
28
|
-
pub width: Option<i64>,
|
|
29
|
-
|
|
30
|
-
/// First page height in points (1/72 inch)
|
|
31
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
32
|
-
pub height: Option<i64>,
|
|
33
|
-
|
|
34
|
-
/// Total number of pages in the PDF document
|
|
35
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
36
|
-
pub page_count: Option<usize>,
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
/// Complete PDF extraction metadata including common and PDF-specific fields.
|
|
40
|
-
///
|
|
41
|
-
/// This struct combines common document fields (title, authors, dates) with
|
|
42
|
-
/// PDF-specific metadata and optional page structure information. It is returned
|
|
43
|
-
/// by `extract_metadata_from_document()` when page boundaries are provided.
|
|
44
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
45
|
-
pub struct PdfExtractionMetadata {
|
|
46
|
-
/// Document title
|
|
47
7
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
48
8
|
pub title: Option<String>,
|
|
49
|
-
|
|
50
|
-
/// Document subject or description
|
|
51
9
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
52
10
|
pub subject: Option<String>,
|
|
53
|
-
|
|
54
|
-
/// Document authors (parsed from PDF Author field)
|
|
55
11
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
56
12
|
pub authors: Option<Vec<String>>,
|
|
57
|
-
|
|
58
|
-
/// Document keywords (parsed from PDF Keywords field)
|
|
59
13
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
60
14
|
pub keywords: Option<Vec<String>>,
|
|
61
|
-
|
|
62
|
-
/// Creation timestamp (ISO 8601 format)
|
|
63
15
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
64
16
|
pub created_at: Option<String>,
|
|
65
|
-
|
|
66
|
-
/// Last modification timestamp (ISO 8601 format)
|
|
67
17
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
68
18
|
pub modified_at: Option<String>,
|
|
69
|
-
|
|
70
|
-
/// Application or user that created the document
|
|
71
19
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
72
20
|
pub created_by: Option<String>,
|
|
73
|
-
|
|
74
|
-
/// PDF-specific metadata
|
|
75
|
-
pub pdf_specific: PdfMetadata,
|
|
76
|
-
|
|
77
|
-
/// Page structure with boundaries and optional per-page metadata
|
|
78
21
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
79
|
-
pub
|
|
22
|
+
pub producer: Option<String>,
|
|
23
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
24
|
+
pub page_count: Option<usize>,
|
|
25
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
26
|
+
pub pdf_version: Option<String>,
|
|
27
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
28
|
+
pub is_encrypted: Option<bool>,
|
|
29
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
30
|
+
pub width: Option<i64>,
|
|
31
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
32
|
+
pub height: Option<i64>,
|
|
33
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
34
|
+
pub summary: Option<String>,
|
|
80
35
|
}
|
|
81
36
|
|
|
82
|
-
/// Extract PDF-specific metadata from raw bytes.
|
|
83
|
-
///
|
|
84
|
-
/// Returns only PDF-specific metadata (version, producer, encryption status, dimensions).
|
|
85
37
|
pub fn extract_metadata(pdf_bytes: &[u8]) -> Result<PdfMetadata> {
|
|
86
38
|
extract_metadata_with_password(pdf_bytes, None)
|
|
87
39
|
}
|
|
88
40
|
|
|
89
|
-
/// Extract PDF-specific metadata from raw bytes with optional password.
|
|
90
|
-
///
|
|
91
|
-
/// Returns only PDF-specific metadata (version, producer, encryption status, dimensions).
|
|
92
41
|
pub fn extract_metadata_with_password(pdf_bytes: &[u8], password: Option<&str>) -> Result<PdfMetadata> {
|
|
93
|
-
let
|
|
42
|
+
let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
|
|
43
|
+
.or_else(|_| Pdfium::bind_to_system_library())
|
|
44
|
+
.map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
|
|
45
|
+
|
|
46
|
+
let pdfium = Pdfium::new(bindings);
|
|
94
47
|
|
|
95
48
|
let document = pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
|
|
96
|
-
let err_msg =
|
|
49
|
+
let err_msg = e.to_string();
|
|
97
50
|
if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
|
|
98
51
|
PdfError::InvalidPassword
|
|
99
52
|
} else if err_msg.contains("password") || err_msg.contains("Password") {
|
|
@@ -103,7 +56,7 @@ pub fn extract_metadata_with_password(pdf_bytes: &[u8], password: Option<&str>)
|
|
|
103
56
|
}
|
|
104
57
|
})?;
|
|
105
58
|
|
|
106
|
-
|
|
59
|
+
extract_metadata_from_document(&document)
|
|
107
60
|
}
|
|
108
61
|
|
|
109
62
|
pub fn extract_metadata_with_passwords(pdf_bytes: &[u8], passwords: &[&str]) -> Result<PdfMetadata> {
|
|
@@ -126,215 +79,70 @@ pub fn extract_metadata_with_passwords(pdf_bytes: &[u8], passwords: &[&str]) ->
|
|
|
126
79
|
extract_metadata(pdf_bytes)
|
|
127
80
|
}
|
|
128
81
|
|
|
129
|
-
|
|
130
|
-
///
|
|
131
|
-
/// Extracts common fields (title, subject, authors, keywords, dates, creator),
|
|
132
|
-
/// PDF-specific metadata, and optionally builds a PageStructure with boundaries.
|
|
133
|
-
///
|
|
134
|
-
/// # Arguments
|
|
135
|
-
///
|
|
136
|
-
/// * `document` - The PDF document to extract metadata from
|
|
137
|
-
/// * `page_boundaries` - Optional vector of PageBoundary entries for building PageStructure.
|
|
138
|
-
/// If provided, a PageStructure will be built with these boundaries.
|
|
139
|
-
///
|
|
140
|
-
/// # Returns
|
|
141
|
-
///
|
|
142
|
-
/// Returns a `PdfExtractionMetadata` struct containing all extracted metadata,
|
|
143
|
-
/// including page structure if boundaries were provided.
|
|
144
|
-
pub fn extract_metadata_from_document(
|
|
145
|
-
document: &PdfDocument<'_>,
|
|
146
|
-
page_boundaries: Option<&[PageBoundary]>,
|
|
147
|
-
) -> Result<PdfExtractionMetadata> {
|
|
148
|
-
extract_metadata_from_document_impl(document, page_boundaries)
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
/// Internal implementation of metadata extraction that can be reused by unified extraction.
|
|
152
|
-
pub(crate) fn extract_metadata_from_document_impl(
|
|
153
|
-
document: &PdfDocument<'_>,
|
|
154
|
-
page_boundaries: Option<&[PageBoundary]>,
|
|
155
|
-
) -> Result<PdfExtractionMetadata> {
|
|
156
|
-
let pdf_specific = extract_pdf_specific_metadata(document)?;
|
|
157
|
-
|
|
158
|
-
let common = extract_common_metadata_from_document(document)?;
|
|
159
|
-
|
|
160
|
-
let page_structure = if let Some(boundaries) = page_boundaries {
|
|
161
|
-
Some(build_page_structure(document, boundaries)?)
|
|
162
|
-
} else {
|
|
163
|
-
None
|
|
164
|
-
};
|
|
165
|
-
|
|
166
|
-
Ok(PdfExtractionMetadata {
|
|
167
|
-
title: common.title,
|
|
168
|
-
subject: common.subject,
|
|
169
|
-
authors: common.authors,
|
|
170
|
-
keywords: common.keywords,
|
|
171
|
-
created_at: common.created_at,
|
|
172
|
-
modified_at: common.modified_at,
|
|
173
|
-
created_by: common.created_by,
|
|
174
|
-
pdf_specific,
|
|
175
|
-
page_structure,
|
|
176
|
-
})
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
/// Extract PDF-specific metadata from a document.
|
|
180
|
-
///
|
|
181
|
-
/// Returns only PDF-specific metadata (version, producer, encryption status, dimensions).
|
|
182
|
-
fn extract_pdf_specific_metadata(document: &PdfDocument<'_>) -> Result<PdfMetadata> {
|
|
82
|
+
pub(crate) fn extract_metadata_from_document(document: &PdfDocument<'_>) -> Result<PdfMetadata> {
|
|
183
83
|
let pdf_metadata = document.metadata();
|
|
184
84
|
|
|
185
85
|
let mut metadata = PdfMetadata {
|
|
186
86
|
pdf_version: format_pdf_version(document.version()),
|
|
187
87
|
..Default::default()
|
|
188
88
|
};
|
|
189
|
-
|
|
89
|
+
metadata.page_count = Some(document.pages().len() as usize);
|
|
190
90
|
metadata.is_encrypted = document
|
|
191
91
|
.permissions()
|
|
192
92
|
.security_handler_revision()
|
|
193
93
|
.ok()
|
|
194
94
|
.map(|revision| revision != PdfSecurityHandlerRevision::Unprotected);
|
|
195
95
|
|
|
196
|
-
metadata.
|
|
197
|
-
.get(PdfDocumentMetadataTagType::
|
|
96
|
+
metadata.title = pdf_metadata
|
|
97
|
+
.get(PdfDocumentMetadataTagType::Title)
|
|
198
98
|
.map(|tag| tag.value().to_string());
|
|
199
99
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
metadata.width = Some(page_rect.width().value.round() as i64);
|
|
204
|
-
metadata.height = Some(page_rect.height().value.round() as i64);
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
// Always capture page count
|
|
208
|
-
metadata.page_count = Some(document.pages().len() as usize);
|
|
209
|
-
|
|
210
|
-
Ok(metadata)
|
|
211
|
-
}
|
|
100
|
+
metadata.subject = pdf_metadata
|
|
101
|
+
.get(PdfDocumentMetadataTagType::Subject)
|
|
102
|
+
.map(|tag| tag.value().to_string());
|
|
212
103
|
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
/// - Character offset boundaries for each page
|
|
219
|
-
/// - Optional per-page metadata with dimensions
|
|
220
|
-
///
|
|
221
|
-
/// # Validation
|
|
222
|
-
///
|
|
223
|
-
/// - Boundaries must not be empty
|
|
224
|
-
/// - Boundary count must match the document's page count
|
|
225
|
-
fn build_page_structure(document: &PdfDocument<'_>, boundaries: &[PageBoundary]) -> Result<PageStructure> {
|
|
226
|
-
let total_count = document.pages().len() as usize;
|
|
227
|
-
|
|
228
|
-
if boundaries.is_empty() {
|
|
229
|
-
return Err(PdfError::MetadataExtractionFailed(
|
|
230
|
-
"No page boundaries provided for PageStructure".to_string(),
|
|
231
|
-
));
|
|
104
|
+
if let Some(author_tag) = pdf_metadata.get(PdfDocumentMetadataTagType::Author) {
|
|
105
|
+
let authors = parse_authors(author_tag.value());
|
|
106
|
+
if !authors.is_empty() {
|
|
107
|
+
metadata.authors = Some(authors);
|
|
108
|
+
}
|
|
232
109
|
}
|
|
233
110
|
|
|
234
|
-
if
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
)));
|
|
111
|
+
if let Some(keywords_tag) = pdf_metadata.get(PdfDocumentMetadataTagType::Keywords) {
|
|
112
|
+
let keywords = parse_keywords(keywords_tag.value());
|
|
113
|
+
if !keywords.is_empty() {
|
|
114
|
+
metadata.keywords = Some(keywords);
|
|
115
|
+
}
|
|
240
116
|
}
|
|
241
117
|
|
|
242
|
-
let
|
|
243
|
-
|
|
244
|
-
let page_number = boundary.page_number;
|
|
245
|
-
|
|
246
|
-
let dimensions = if let Ok(page_rect) = document.pages().page_size(index as i32) {
|
|
247
|
-
Some((page_rect.width().value as f64, page_rect.height().value as f64))
|
|
248
|
-
} else {
|
|
249
|
-
None
|
|
250
|
-
};
|
|
251
|
-
|
|
252
|
-
pages.push(PageInfo {
|
|
253
|
-
number: page_number,
|
|
254
|
-
title: None,
|
|
255
|
-
dimensions,
|
|
256
|
-
image_count: None,
|
|
257
|
-
table_count: None,
|
|
258
|
-
hidden: None,
|
|
259
|
-
});
|
|
118
|
+
if let Some(created_tag) = pdf_metadata.get(PdfDocumentMetadataTagType::CreationDate) {
|
|
119
|
+
metadata.created_at = Some(parse_pdf_date(created_tag.value()));
|
|
260
120
|
}
|
|
261
121
|
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
unit_type: PageUnitType::Page,
|
|
265
|
-
boundaries: Some(boundaries.to_vec()),
|
|
266
|
-
pages: if pages.is_empty() { None } else { Some(pages) },
|
|
267
|
-
})
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
/// Extract common metadata from a PDF document.
|
|
271
|
-
///
|
|
272
|
-
/// Returns common fields (title, authors, keywords, dates) that are now stored
|
|
273
|
-
/// in the base `Metadata` struct instead of format-specific metadata.
|
|
274
|
-
///
|
|
275
|
-
/// This function uses batch fetching with caching to optimize metadata extraction
|
|
276
|
-
/// by reducing repeated dictionary lookups. All metadata tags are fetched once and
|
|
277
|
-
/// cached in a single pass.
|
|
278
|
-
pub fn extract_common_metadata_from_document(document: &PdfDocument<'_>) -> Result<CommonPdfMetadata> {
|
|
279
|
-
let pdf_metadata = document.metadata();
|
|
280
|
-
|
|
281
|
-
let tag_types = [
|
|
282
|
-
PdfDocumentMetadataTagType::Title,
|
|
283
|
-
PdfDocumentMetadataTagType::Subject,
|
|
284
|
-
PdfDocumentMetadataTagType::Author,
|
|
285
|
-
PdfDocumentMetadataTagType::Keywords,
|
|
286
|
-
PdfDocumentMetadataTagType::CreationDate,
|
|
287
|
-
PdfDocumentMetadataTagType::ModificationDate,
|
|
288
|
-
PdfDocumentMetadataTagType::Creator,
|
|
289
|
-
];
|
|
290
|
-
|
|
291
|
-
let mut metadata_cache: [Option<String>; 7] = Default::default();
|
|
292
|
-
for (index, tag_type) in tag_types.iter().enumerate() {
|
|
293
|
-
if let Some(tag) = pdf_metadata.get(*tag_type) {
|
|
294
|
-
metadata_cache[index] = Some(tag.value().to_string());
|
|
295
|
-
}
|
|
122
|
+
if let Some(modified_tag) = pdf_metadata.get(PdfDocumentMetadataTagType::ModificationDate) {
|
|
123
|
+
metadata.modified_at = Some(parse_pdf_date(modified_tag.value()));
|
|
296
124
|
}
|
|
297
125
|
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
let authors = metadata_cache[2]
|
|
303
|
-
.as_ref()
|
|
304
|
-
.map(|author_str| parse_authors(author_str))
|
|
305
|
-
.and_then(|parsed| if !parsed.is_empty() { Some(parsed) } else { None });
|
|
306
|
-
|
|
307
|
-
let keywords = metadata_cache[3]
|
|
308
|
-
.as_ref()
|
|
309
|
-
.map(|keywords_str| parse_keywords(keywords_str))
|
|
310
|
-
.and_then(|parsed| if !parsed.is_empty() { Some(parsed) } else { None });
|
|
311
|
-
|
|
312
|
-
let created_at = metadata_cache[4].as_ref().map(|date_str| parse_pdf_date(date_str));
|
|
126
|
+
metadata.created_by = pdf_metadata
|
|
127
|
+
.get(PdfDocumentMetadataTagType::Creator)
|
|
128
|
+
.map(|tag| tag.value().to_string());
|
|
313
129
|
|
|
314
|
-
|
|
130
|
+
metadata.producer = pdf_metadata
|
|
131
|
+
.get(PdfDocumentMetadataTagType::Producer)
|
|
132
|
+
.map(|tag| tag.value().to_string());
|
|
315
133
|
|
|
316
|
-
|
|
134
|
+
if !document.pages().is_empty()
|
|
135
|
+
&& let Ok(page_rect) = document.pages().page_size(0)
|
|
136
|
+
{
|
|
137
|
+
metadata.width = Some(page_rect.width().value.round() as i64);
|
|
138
|
+
metadata.height = Some(page_rect.height().value.round() as i64);
|
|
139
|
+
}
|
|
317
140
|
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
authors,
|
|
322
|
-
keywords,
|
|
323
|
-
created_at,
|
|
324
|
-
modified_at,
|
|
325
|
-
created_by,
|
|
326
|
-
})
|
|
327
|
-
}
|
|
141
|
+
if metadata.summary.is_none() {
|
|
142
|
+
metadata.summary = Some(generate_summary(&metadata));
|
|
143
|
+
}
|
|
328
144
|
|
|
329
|
-
|
|
330
|
-
pub struct CommonPdfMetadata {
|
|
331
|
-
pub title: Option<String>,
|
|
332
|
-
pub subject: Option<String>,
|
|
333
|
-
pub authors: Option<Vec<String>>,
|
|
334
|
-
pub keywords: Option<Vec<String>>,
|
|
335
|
-
pub created_at: Option<String>,
|
|
336
|
-
pub modified_at: Option<String>,
|
|
337
|
-
pub created_by: Option<String>,
|
|
145
|
+
Ok(metadata)
|
|
338
146
|
}
|
|
339
147
|
|
|
340
148
|
fn parse_authors(author_str: &str) -> Vec<String> {
|
|
@@ -398,6 +206,25 @@ fn parse_pdf_date(date_str: &str) -> String {
|
|
|
398
206
|
}
|
|
399
207
|
}
|
|
400
208
|
|
|
209
|
+
fn generate_summary(metadata: &PdfMetadata) -> String {
|
|
210
|
+
let mut parts = Vec::new();
|
|
211
|
+
|
|
212
|
+
if let Some(page_count) = metadata.page_count {
|
|
213
|
+
let plural = if page_count != 1 { "s" } else { "" };
|
|
214
|
+
parts.push(format!("PDF document with {} page{}.", page_count, plural));
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
if let Some(ref version) = metadata.pdf_version {
|
|
218
|
+
parts.push(format!("PDF version {}.", version));
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
if metadata.is_encrypted == Some(true) {
|
|
222
|
+
parts.push("Document is encrypted.".to_string());
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
parts.join(" ")
|
|
226
|
+
}
|
|
227
|
+
|
|
401
228
|
fn format_pdf_version(version: PdfDocumentVersion) -> Option<String> {
|
|
402
229
|
match version {
|
|
403
230
|
PdfDocumentVersion::Unset => None,
|
|
@@ -485,25 +312,35 @@ mod tests {
|
|
|
485
312
|
}
|
|
486
313
|
|
|
487
314
|
#[test]
|
|
488
|
-
fn
|
|
489
|
-
let
|
|
490
|
-
|
|
315
|
+
fn test_generate_summary() {
|
|
316
|
+
let metadata = PdfMetadata {
|
|
317
|
+
page_count: Some(10),
|
|
318
|
+
pdf_version: Some("1.7".to_string()),
|
|
319
|
+
is_encrypted: Some(false),
|
|
320
|
+
..Default::default()
|
|
321
|
+
};
|
|
322
|
+
|
|
323
|
+
let summary = generate_summary(&metadata);
|
|
324
|
+
assert!(summary.contains("10 pages"));
|
|
325
|
+
assert!(summary.contains("1.7"));
|
|
326
|
+
assert!(!summary.contains("encrypted"));
|
|
491
327
|
}
|
|
492
328
|
|
|
493
329
|
#[test]
|
|
494
|
-
fn
|
|
495
|
-
let
|
|
496
|
-
|
|
330
|
+
fn test_generate_summary_single_page() {
|
|
331
|
+
let metadata = PdfMetadata {
|
|
332
|
+
page_count: Some(1),
|
|
333
|
+
..Default::default()
|
|
334
|
+
};
|
|
335
|
+
|
|
336
|
+
let summary = generate_summary(&metadata);
|
|
337
|
+
assert!(summary.contains("1 page."));
|
|
338
|
+
assert!(!summary.contains("pages"));
|
|
497
339
|
}
|
|
498
340
|
|
|
499
341
|
#[test]
|
|
500
|
-
fn
|
|
501
|
-
let
|
|
502
|
-
|
|
503
|
-
let error_msg = format!(
|
|
504
|
-
"Boundary count {} doesn't match page count {}",
|
|
505
|
-
boundaries_count, page_count
|
|
506
|
-
);
|
|
507
|
-
assert_eq!(error_msg, "Boundary count 3 doesn't match page count 5");
|
|
342
|
+
fn test_extract_metadata_invalid_pdf() {
|
|
343
|
+
let result = extract_metadata(b"not a pdf");
|
|
344
|
+
assert!(result.is_err());
|
|
508
345
|
}
|
|
509
346
|
}
|
|
@@ -26,56 +26,25 @@
|
|
|
26
26
|
//!
|
|
27
27
|
//! // Extract metadata
|
|
28
28
|
//! let metadata = extract_metadata(&pdf_bytes)?;
|
|
29
|
-
//! println!("
|
|
29
|
+
//! println!("Page count: {:?}", metadata.page_count);
|
|
30
30
|
//! # Ok(())
|
|
31
31
|
//! # }
|
|
32
32
|
//! ```
|
|
33
33
|
//!
|
|
34
34
|
//! # Note
|
|
35
35
|
//!
|
|
36
|
-
//! This module
|
|
36
|
+
//! This module is always available. The `ocr` feature enables additional
|
|
37
37
|
//! functionality in the PDF extractor for rendering pages to images.
|
|
38
|
-
#[cfg(feature = "pdf")]
|
|
39
|
-
pub(crate) mod bindings;
|
|
40
|
-
#[cfg(all(feature = "pdf", feature = "bundled-pdfium"))]
|
|
41
|
-
pub mod bundled;
|
|
42
|
-
#[cfg(feature = "pdf")]
|
|
43
38
|
pub mod error;
|
|
44
|
-
#[cfg(feature = "pdf")]
|
|
45
|
-
pub mod fonts;
|
|
46
|
-
#[cfg(feature = "pdf")]
|
|
47
|
-
pub mod hierarchy;
|
|
48
|
-
#[cfg(feature = "pdf")]
|
|
49
39
|
pub mod images;
|
|
50
|
-
#[cfg(feature = "pdf")]
|
|
51
40
|
pub mod metadata;
|
|
52
|
-
#[cfg(feature = "pdf")]
|
|
53
41
|
pub mod rendering;
|
|
54
|
-
#[cfg(feature = "pdf")]
|
|
55
42
|
pub mod table;
|
|
56
|
-
#[cfg(feature = "pdf")]
|
|
57
43
|
pub mod text;
|
|
58
44
|
|
|
59
|
-
#[cfg(feature = "pdf")]
|
|
60
|
-
pub use crate::core::config::HierarchyConfig;
|
|
61
|
-
#[cfg(all(feature = "pdf", feature = "bundled-pdfium"))]
|
|
62
|
-
pub use bundled::extract_bundled_pdfium;
|
|
63
|
-
#[cfg(feature = "pdf")]
|
|
64
45
|
pub use error::PdfError;
|
|
65
|
-
#[cfg(feature = "pdf")]
|
|
66
|
-
pub use fonts::{cached_font_count, get_font_descriptors, initialize_font_cache};
|
|
67
|
-
#[cfg(feature = "pdf")]
|
|
68
|
-
pub use hierarchy::{
|
|
69
|
-
BoundingBox, CharData, FontSizeCluster, HierarchyLevel, TextBlock, assign_hierarchy_levels,
|
|
70
|
-
assign_hierarchy_levels_from_clusters, cluster_font_sizes, extract_chars_with_fonts, should_trigger_ocr,
|
|
71
|
-
};
|
|
72
|
-
#[cfg(feature = "pdf")]
|
|
73
46
|
pub use images::{PdfImage, PdfImageExtractor, extract_images_from_pdf};
|
|
74
|
-
#[cfg(feature = "pdf")]
|
|
75
47
|
pub use metadata::extract_metadata;
|
|
76
|
-
#[cfg(feature = "pdf")]
|
|
77
48
|
pub use rendering::{PageRenderOptions, render_page_to_image};
|
|
78
|
-
#[cfg(feature = "pdf")]
|
|
79
49
|
pub use table::extract_words_from_page;
|
|
80
|
-
#[cfg(feature = "pdf")]
|
|
81
50
|
pub use text::extract_text_from_pdf;
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
use super::bindings::{PdfiumHandle, bind_pdfium};
|
|
2
1
|
use super::error::{PdfError, Result};
|
|
3
2
|
use image::DynamicImage;
|
|
4
3
|
use pdfium_render::prelude::*;
|
|
@@ -27,18 +26,20 @@ impl Default for PageRenderOptions {
|
|
|
27
26
|
}
|
|
28
27
|
}
|
|
29
28
|
|
|
30
|
-
pub struct PdfRenderer
|
|
31
|
-
pdfium:
|
|
29
|
+
pub struct PdfRenderer {
|
|
30
|
+
pdfium: Pdfium,
|
|
32
31
|
}
|
|
33
32
|
|
|
34
|
-
impl PdfRenderer
|
|
33
|
+
impl PdfRenderer {
|
|
35
34
|
pub fn new() -> Result<Self> {
|
|
36
|
-
let
|
|
35
|
+
let binding = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
|
|
36
|
+
.or_else(|_| Pdfium::bind_to_system_library())
|
|
37
|
+
.map_err(|e| PdfError::RenderingFailed(format!("Failed to initialize Pdfium: {}", e)))?;
|
|
38
|
+
|
|
39
|
+
let pdfium = Pdfium::new(binding);
|
|
37
40
|
Ok(Self { pdfium })
|
|
38
41
|
}
|
|
39
|
-
}
|
|
40
42
|
|
|
41
|
-
impl PdfRenderer<'_> {
|
|
42
43
|
pub fn render_page_to_image(
|
|
43
44
|
&self,
|
|
44
45
|
pdf_bytes: &[u8],
|
|
@@ -56,7 +57,7 @@ impl PdfRenderer<'_> {
|
|
|
56
57
|
password: Option<&str>,
|
|
57
58
|
) -> Result<DynamicImage> {
|
|
58
59
|
let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
|
|
59
|
-
let err_msg =
|
|
60
|
+
let err_msg = e.to_string();
|
|
60
61
|
if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
|
|
61
62
|
PdfError::InvalidPassword
|
|
62
63
|
} else if err_msg.contains("password") || err_msg.contains("Password") {
|
|
@@ -68,7 +69,7 @@ impl PdfRenderer<'_> {
|
|
|
68
69
|
|
|
69
70
|
let page = document
|
|
70
71
|
.pages()
|
|
71
|
-
.get(page_index as
|
|
72
|
+
.get(page_index as u16)
|
|
72
73
|
.map_err(|_| PdfError::PageNotFound(page_index))?;
|
|
73
74
|
|
|
74
75
|
let width_points = page.width().value;
|
|
@@ -114,7 +115,7 @@ impl PdfRenderer<'_> {
|
|
|
114
115
|
password: Option<&str>,
|
|
115
116
|
) -> Result<Vec<DynamicImage>> {
|
|
116
117
|
let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
|
|
117
|
-
let err_msg =
|
|
118
|
+
let err_msg = e.to_string();
|
|
118
119
|
if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
|
|
119
120
|
PdfError::InvalidPassword
|
|
120
121
|
} else if err_msg.contains("password") || err_msg.contains("Password") {
|
|
@@ -237,8 +238,7 @@ mod tests {
|
|
|
237
238
|
|
|
238
239
|
#[test]
|
|
239
240
|
fn test_renderer_size() {
|
|
240
|
-
|
|
241
|
-
let _size = size_of::<PdfRenderer>();
|
|
241
|
+
assert!(size_of::<PdfRenderer>() > 0);
|
|
242
242
|
}
|
|
243
243
|
|
|
244
244
|
#[test]
|