kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -3,28 +3,19 @@
|
|
|
3
3
|
use crate::Result;
|
|
4
4
|
use crate::core::config::ExtractionConfig;
|
|
5
5
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
6
|
-
use crate::types::{ExtractionResult, Metadata
|
|
6
|
+
use crate::types::{ExtractionResult, Metadata};
|
|
7
7
|
use async_trait::async_trait;
|
|
8
|
-
#[cfg(feature = "tokio-runtime")]
|
|
9
8
|
use std::path::Path;
|
|
10
9
|
|
|
11
10
|
#[cfg(feature = "pdf")]
|
|
12
11
|
use crate::pdf::error::PdfError;
|
|
13
12
|
#[cfg(feature = "ocr")]
|
|
14
13
|
use crate::pdf::rendering::{PageRenderOptions, PdfRenderer};
|
|
15
|
-
#[cfg(feature = "pdf")]
|
|
14
|
+
#[cfg(all(feature = "pdf", feature = "ocr"))]
|
|
16
15
|
use crate::types::Table;
|
|
17
16
|
#[cfg(feature = "pdf")]
|
|
18
17
|
use pdfium_render::prelude::*;
|
|
19
18
|
|
|
20
|
-
#[cfg(feature = "pdf")]
|
|
21
|
-
type PdfExtractionPhaseResult = (
|
|
22
|
-
crate::pdf::metadata::PdfExtractionMetadata,
|
|
23
|
-
String,
|
|
24
|
-
Vec<Table>,
|
|
25
|
-
Option<Vec<PageContent>>,
|
|
26
|
-
);
|
|
27
|
-
|
|
28
19
|
#[cfg(feature = "ocr")]
|
|
29
20
|
const MIN_TOTAL_NON_WHITESPACE: usize = 64;
|
|
30
21
|
#[cfg(feature = "ocr")]
|
|
@@ -146,37 +137,41 @@ fn evaluate_native_text_for_ocr(native_text: &str, page_count: Option<usize>) ->
|
|
|
146
137
|
///
|
|
147
138
|
/// This function converts PDF character positions to HocrWord format,
|
|
148
139
|
/// then uses the existing table reconstruction logic to detect tables.
|
|
149
|
-
///
|
|
150
|
-
/// Uses the shared PdfDocument reference (wrapped in Arc<RwLock<>> for thread-safety).
|
|
151
140
|
#[cfg(all(feature = "pdf", feature = "ocr"))]
|
|
152
141
|
fn extract_tables_from_document(
|
|
153
142
|
document: &PdfDocument,
|
|
154
|
-
_metadata: &crate::pdf::metadata::
|
|
143
|
+
_metadata: &crate::pdf::metadata::PdfMetadata,
|
|
155
144
|
) -> Result<Vec<Table>> {
|
|
156
145
|
use crate::ocr::table::{reconstruct_table, table_to_markdown};
|
|
157
146
|
use crate::pdf::table::extract_words_from_page;
|
|
158
147
|
|
|
159
148
|
let mut all_tables = Vec::new();
|
|
160
149
|
|
|
150
|
+
// Process each page
|
|
161
151
|
for (page_index, page) in document.pages().iter().enumerate() {
|
|
162
|
-
|
|
152
|
+
// Extract words with positions from the page
|
|
153
|
+
let words = extract_words_from_page(&page, 0.0)?; // Use 0.0 confidence for PDF (always high quality)
|
|
163
154
|
|
|
164
155
|
if words.is_empty() {
|
|
165
156
|
continue;
|
|
166
157
|
}
|
|
167
158
|
|
|
159
|
+
// Use existing table reconstruction logic
|
|
160
|
+
// These thresholds match the defaults from TesseractConfig
|
|
168
161
|
let column_threshold = 50;
|
|
169
162
|
let row_threshold_ratio = 0.5;
|
|
170
163
|
|
|
171
|
-
|
|
164
|
+
// Reconstruct table from positioned words
|
|
165
|
+
let table_cells = reconstruct_table(&words, column_threshold, row_threshold_ratio, true);
|
|
172
166
|
|
|
173
167
|
if !table_cells.is_empty() {
|
|
168
|
+
// Generate markdown representation
|
|
174
169
|
let markdown = table_to_markdown(&table_cells);
|
|
175
170
|
|
|
176
171
|
all_tables.push(Table {
|
|
177
172
|
cells: table_cells,
|
|
178
173
|
markdown,
|
|
179
|
-
page_number: page_index + 1,
|
|
174
|
+
page_number: page_index + 1, // 1-indexed
|
|
180
175
|
});
|
|
181
176
|
}
|
|
182
177
|
}
|
|
@@ -188,47 +183,11 @@ fn extract_tables_from_document(
|
|
|
188
183
|
#[cfg(all(feature = "pdf", not(feature = "ocr")))]
|
|
189
184
|
fn extract_tables_from_document(
|
|
190
185
|
_document: &PdfDocument,
|
|
191
|
-
_metadata: &crate::pdf::metadata::
|
|
186
|
+
_metadata: &crate::pdf::metadata::PdfMetadata,
|
|
192
187
|
) -> Result<Vec<crate::types::Table>> {
|
|
193
188
|
Ok(vec![])
|
|
194
189
|
}
|
|
195
190
|
|
|
196
|
-
/// Helper function to assign tables and images to pages.
|
|
197
|
-
///
|
|
198
|
-
/// If page_contents is None, returns None (no per-page tracking enabled).
|
|
199
|
-
/// Otherwise, iterates through tables and images, assigning them to pages based on page_number.
|
|
200
|
-
///
|
|
201
|
-
/// # Performance
|
|
202
|
-
///
|
|
203
|
-
/// Uses Arc::new to wrap tables and images, avoiding expensive copies.
|
|
204
|
-
/// This reduces memory overhead by enabling zero-copy sharing of table/image data
|
|
205
|
-
/// across multiple references (e.g., when the same table appears on multiple pages).
|
|
206
|
-
fn assign_tables_and_images_to_pages(
|
|
207
|
-
mut page_contents: Option<Vec<PageContent>>,
|
|
208
|
-
tables: &[crate::types::Table],
|
|
209
|
-
images: &[crate::types::ExtractedImage],
|
|
210
|
-
) -> Option<Vec<PageContent>> {
|
|
211
|
-
let pages = page_contents.take()?;
|
|
212
|
-
|
|
213
|
-
let mut updated_pages = pages;
|
|
214
|
-
|
|
215
|
-
for table in tables {
|
|
216
|
-
if let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == table.page_number) {
|
|
217
|
-
page.tables.push(std::sync::Arc::new(table.clone()));
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
for image in images {
|
|
222
|
-
if let Some(page_num) = image.page_number
|
|
223
|
-
&& let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == page_num)
|
|
224
|
-
{
|
|
225
|
-
page.images.push(std::sync::Arc::new(image.clone()));
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
|
|
229
|
-
Some(updated_pages)
|
|
230
|
-
}
|
|
231
|
-
|
|
232
191
|
/// PDF document extractor using pypdfium2 and playa-pdf.
|
|
233
192
|
pub struct PdfExtractor;
|
|
234
193
|
|
|
@@ -243,41 +202,6 @@ impl PdfExtractor {
|
|
|
243
202
|
Self
|
|
244
203
|
}
|
|
245
204
|
|
|
246
|
-
/// Extract text, metadata, and tables from a PDF document using a single shared instance.
|
|
247
|
-
///
|
|
248
|
-
/// This method consolidates all PDF extraction phases (text, metadata, tables) into a single
|
|
249
|
-
/// operation using a single PdfDocument instance. This avoids redundant document parsing
|
|
250
|
-
/// and pdfium initialization overhead.
|
|
251
|
-
///
|
|
252
|
-
/// # Performance
|
|
253
|
-
///
|
|
254
|
-
/// By reusing a single document instance across all extraction phases, we eliminate:
|
|
255
|
-
/// - Duplicate document parsing overhead (25-40ms saved)
|
|
256
|
-
/// - Redundant pdfium bindings initialization
|
|
257
|
-
/// - Multiple page tree traversals
|
|
258
|
-
///
|
|
259
|
-
/// Expected improvement: 20-30% faster PDF processing.
|
|
260
|
-
///
|
|
261
|
-
/// # Returns
|
|
262
|
-
///
|
|
263
|
-
/// A tuple containing:
|
|
264
|
-
/// - PDF metadata (title, authors, dates, page structure, etc.)
|
|
265
|
-
/// - Native extracted text (or empty if using OCR)
|
|
266
|
-
/// - Extracted tables (if OCR feature enabled)
|
|
267
|
-
/// - Per-page content (if page extraction configured)
|
|
268
|
-
#[cfg(feature = "pdf")]
|
|
269
|
-
fn extract_all_from_document(
|
|
270
|
-
document: &PdfDocument,
|
|
271
|
-
config: &ExtractionConfig,
|
|
272
|
-
) -> Result<PdfExtractionPhaseResult> {
|
|
273
|
-
let (native_text, _boundaries, page_contents, pdf_metadata) =
|
|
274
|
-
crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;
|
|
275
|
-
|
|
276
|
-
let tables = extract_tables_from_document(document, &pdf_metadata)?;
|
|
277
|
-
|
|
278
|
-
Ok((pdf_metadata, native_text, tables, page_contents))
|
|
279
|
-
}
|
|
280
|
-
|
|
281
205
|
/// Extract text from PDF using OCR.
|
|
282
206
|
///
|
|
283
207
|
/// Renders all pages to images and processes them with OCR.
|
|
@@ -363,13 +287,6 @@ impl Plugin for PdfExtractor {
|
|
|
363
287
|
|
|
364
288
|
#[async_trait]
|
|
365
289
|
impl DocumentExtractor for PdfExtractor {
|
|
366
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
367
|
-
skip(self, content, config),
|
|
368
|
-
fields(
|
|
369
|
-
extractor.name = self.name(),
|
|
370
|
-
content.size_bytes = content.len(),
|
|
371
|
-
)
|
|
372
|
-
))]
|
|
373
290
|
async fn extract_bytes(
|
|
374
291
|
&self,
|
|
375
292
|
content: &[u8],
|
|
@@ -377,26 +294,18 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
377
294
|
config: &ExtractionConfig,
|
|
378
295
|
) -> Result<ExtractionResult> {
|
|
379
296
|
#[cfg(feature = "pdf")]
|
|
380
|
-
let (pdf_metadata, native_text, tables
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
}
|
|
393
|
-
} else {
|
|
394
|
-
pdf_err.into()
|
|
395
|
-
}
|
|
396
|
-
})?;
|
|
397
|
-
|
|
398
|
-
let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
|
|
399
|
-
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
297
|
+
let (pdf_metadata, native_text, tables) = if crate::core::batch_mode::is_batch_mode() {
|
|
298
|
+
// Batch mode: Move PDF extraction to blocking thread pool to enable parallelism
|
|
299
|
+
let content_owned = content.to_vec();
|
|
300
|
+
tokio::task::spawn_blocking(move || {
|
|
301
|
+
let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
|
|
302
|
+
.or_else(|_| Pdfium::bind_to_system_library())
|
|
303
|
+
.map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
|
|
304
|
+
|
|
305
|
+
let pdfium = Pdfium::new(bindings);
|
|
306
|
+
|
|
307
|
+
let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
|
|
308
|
+
let err_msg = e.to_string();
|
|
400
309
|
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
401
310
|
PdfError::PasswordRequired
|
|
402
311
|
} else {
|
|
@@ -404,79 +313,40 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
404
313
|
}
|
|
405
314
|
})?;
|
|
406
315
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
let (pdf_metadata, native_text, tables, page_contents) =
|
|
431
|
-
Self::extract_all_from_document(&document, &config_owned)?;
|
|
432
|
-
|
|
433
|
-
if let Some(page_cfg) = config_owned.pages.as_ref()
|
|
434
|
-
&& page_cfg.extract_pages
|
|
435
|
-
&& page_contents.is_none()
|
|
436
|
-
{
|
|
437
|
-
return Err(PdfError::ExtractionFailed(
|
|
438
|
-
"Page extraction was configured but no page data was extracted in batch mode"
|
|
439
|
-
.to_string(),
|
|
440
|
-
)
|
|
441
|
-
.into());
|
|
442
|
-
}
|
|
443
|
-
|
|
444
|
-
Ok::<_, crate::error::KreuzbergError>((pdf_metadata, native_text, tables, page_contents))
|
|
445
|
-
})
|
|
446
|
-
.await
|
|
447
|
-
.map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
|
|
316
|
+
let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
|
|
317
|
+
let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
|
|
318
|
+
|
|
319
|
+
// Extract tables from native PDF text (when not using OCR)
|
|
320
|
+
let tables = extract_tables_from_document(&document, &metadata)?;
|
|
321
|
+
|
|
322
|
+
Ok::<_, crate::error::KreuzbergError>((metadata, native_text, tables))
|
|
323
|
+
})
|
|
324
|
+
.await
|
|
325
|
+
.map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
|
|
326
|
+
} else {
|
|
327
|
+
// Single-file mode: Direct extraction (no spawn overhead)
|
|
328
|
+
let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
|
|
329
|
+
.or_else(|_| Pdfium::bind_to_system_library())
|
|
330
|
+
.map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
|
|
331
|
+
|
|
332
|
+
let pdfium = Pdfium::new(bindings);
|
|
333
|
+
|
|
334
|
+
let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
|
|
335
|
+
let err_msg = e.to_string();
|
|
336
|
+
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
337
|
+
PdfError::PasswordRequired
|
|
448
338
|
} else {
|
|
449
|
-
|
|
450
|
-
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
451
|
-
|
|
452
|
-
let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
|
|
453
|
-
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
454
|
-
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
455
|
-
PdfError::PasswordRequired
|
|
456
|
-
} else {
|
|
457
|
-
PdfError::InvalidPdf(err_msg)
|
|
458
|
-
}
|
|
459
|
-
})?;
|
|
460
|
-
|
|
461
|
-
Self::extract_all_from_document(&document, config)?
|
|
339
|
+
PdfError::InvalidPdf(err_msg)
|
|
462
340
|
}
|
|
463
|
-
}
|
|
464
|
-
#[cfg(all(not(target_arch = "wasm32"), not(feature = "tokio-runtime")))]
|
|
465
|
-
{
|
|
466
|
-
let pdfium =
|
|
467
|
-
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
341
|
+
})?;
|
|
468
342
|
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
472
|
-
PdfError::PasswordRequired
|
|
473
|
-
} else {
|
|
474
|
-
PdfError::InvalidPdf(err_msg)
|
|
475
|
-
}
|
|
476
|
-
})?;
|
|
343
|
+
let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
|
|
344
|
+
let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
|
|
477
345
|
|
|
478
|
-
|
|
479
|
-
|
|
346
|
+
// Extract tables from native PDF text (when not using OCR)
|
|
347
|
+
let tables = extract_tables_from_document(&document, &metadata)?;
|
|
348
|
+
|
|
349
|
+
(metadata, native_text, tables)
|
|
480
350
|
};
|
|
481
351
|
|
|
482
352
|
#[cfg(feature = "ocr")]
|
|
@@ -487,19 +357,20 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
487
357
|
native_text
|
|
488
358
|
}
|
|
489
359
|
} else if config.ocr.is_some() {
|
|
490
|
-
let decision = evaluate_native_text_for_ocr(&native_text,
|
|
360
|
+
let decision = evaluate_native_text_for_ocr(&native_text, pdf_metadata.page_count);
|
|
491
361
|
|
|
492
362
|
if std::env::var("KREUZBERG_DEBUG_OCR").is_ok() {
|
|
493
363
|
eprintln!(
|
|
494
364
|
"[kreuzberg::pdf::ocr] fallback={} non_whitespace={} alnum={} meaningful_words={} \
|
|
495
|
-
avg_non_whitespace={:.2} avg_alnum={:.2} alnum_ratio={:.3}",
|
|
365
|
+
avg_non_whitespace={:.2} avg_alnum={:.2} alnum_ratio={:.3} pages={}",
|
|
496
366
|
decision.fallback,
|
|
497
367
|
decision.stats.non_whitespace,
|
|
498
368
|
decision.stats.alnum,
|
|
499
369
|
decision.stats.meaningful_words,
|
|
500
370
|
decision.avg_non_whitespace,
|
|
501
371
|
decision.avg_alnum,
|
|
502
|
-
decision.stats.alnum_ratio
|
|
372
|
+
decision.stats.alnum_ratio,
|
|
373
|
+
pdf_metadata.page_count.unwrap_or(0)
|
|
503
374
|
);
|
|
504
375
|
}
|
|
505
376
|
|
|
@@ -515,22 +386,7 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
515
386
|
#[cfg(not(feature = "ocr"))]
|
|
516
387
|
let text = native_text;
|
|
517
388
|
|
|
518
|
-
|
|
519
|
-
if let Some(ref page_cfg) = config.pages
|
|
520
|
-
&& page_cfg.insert_page_markers
|
|
521
|
-
{
|
|
522
|
-
let marker_placeholder = page_cfg.marker_format.replace("{page_num}", "");
|
|
523
|
-
if !marker_placeholder.is_empty() && !text.contains(&marker_placeholder) {
|
|
524
|
-
#[cfg(feature = "otel")]
|
|
525
|
-
tracing::warn!(
|
|
526
|
-
"Page markers were configured but none found in extracted content. \
|
|
527
|
-
This may indicate very short documents or incomplete extraction."
|
|
528
|
-
);
|
|
529
|
-
}
|
|
530
|
-
}
|
|
531
|
-
|
|
532
|
-
let images = if config.images.as_ref().map(|c| c.extract_images).unwrap_or(false) {
|
|
533
|
-
// Image extraction is enabled, extract images if present
|
|
389
|
+
let images = if config.images.is_some() {
|
|
534
390
|
match crate::pdf::images::extract_images_from_pdf(content) {
|
|
535
391
|
Ok(pdf_images) => Some(
|
|
536
392
|
pdf_images
|
|
@@ -554,41 +410,23 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
554
410
|
})
|
|
555
411
|
.collect(),
|
|
556
412
|
),
|
|
557
|
-
|
|
558
|
-
Err(_) => Some(vec![]),
|
|
413
|
+
Err(_) => None,
|
|
559
414
|
}
|
|
560
415
|
} else {
|
|
561
|
-
// Image extraction is not enabled
|
|
562
416
|
None
|
|
563
417
|
};
|
|
564
418
|
|
|
565
|
-
|
|
419
|
+
// Tables were extracted during metadata/text extraction phase
|
|
420
|
+
// (see extract_tables_from_document function below)
|
|
566
421
|
|
|
567
422
|
Ok(ExtractionResult {
|
|
568
423
|
content: text,
|
|
569
424
|
mime_type: mime_type.to_string(),
|
|
570
425
|
metadata: Metadata {
|
|
571
426
|
#[cfg(feature = "pdf")]
|
|
572
|
-
|
|
573
|
-
#[cfg(feature = "pdf")]
|
|
574
|
-
subject: pdf_metadata.subject.clone(),
|
|
575
|
-
#[cfg(feature = "pdf")]
|
|
576
|
-
authors: pdf_metadata.authors.clone(),
|
|
577
|
-
#[cfg(feature = "pdf")]
|
|
578
|
-
keywords: pdf_metadata.keywords.clone(),
|
|
579
|
-
#[cfg(feature = "pdf")]
|
|
580
|
-
created_at: pdf_metadata.created_at.clone(),
|
|
581
|
-
#[cfg(feature = "pdf")]
|
|
582
|
-
modified_at: pdf_metadata.modified_at.clone(),
|
|
583
|
-
#[cfg(feature = "pdf")]
|
|
584
|
-
created_by: pdf_metadata.created_by.clone(),
|
|
585
|
-
#[cfg(feature = "pdf")]
|
|
586
|
-
pages: pdf_metadata.page_structure.clone(),
|
|
587
|
-
#[cfg(feature = "pdf")]
|
|
588
|
-
format: Some(crate::types::FormatMetadata::Pdf(pdf_metadata.pdf_specific)),
|
|
427
|
+
format: Some(crate::types::FormatMetadata::Pdf(pdf_metadata)),
|
|
589
428
|
..Default::default()
|
|
590
429
|
},
|
|
591
|
-
pages: final_pages,
|
|
592
430
|
tables,
|
|
593
431
|
detected_languages: None,
|
|
594
432
|
chunks: None,
|
|
@@ -596,7 +434,6 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
596
434
|
})
|
|
597
435
|
}
|
|
598
436
|
|
|
599
|
-
#[cfg(feature = "tokio-runtime")]
|
|
600
437
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
601
438
|
let bytes = tokio::fs::read(path).await?;
|
|
602
439
|
self.extract_bytes(&bytes, mime_type, config).await
|
|
@@ -656,106 +493,4 @@ mod tests {
|
|
|
656
493
|
let sample = " . , ; : -- -- ";
|
|
657
494
|
assert!(evaluate_native_text_for_ocr(sample, Some(2)).fallback);
|
|
658
495
|
}
|
|
659
|
-
|
|
660
|
-
#[tokio::test]
|
|
661
|
-
#[cfg(feature = "pdf")]
|
|
662
|
-
async fn test_pdf_batch_mode_validates_page_config_enabled() {
|
|
663
|
-
use crate::core::config::PageConfig;
|
|
664
|
-
|
|
665
|
-
let extractor = PdfExtractor::new();
|
|
666
|
-
|
|
667
|
-
let config = ExtractionConfig {
|
|
668
|
-
pages: Some(PageConfig {
|
|
669
|
-
extract_pages: true,
|
|
670
|
-
insert_page_markers: false,
|
|
671
|
-
marker_format: "<!-- PAGE {page_num} -->".to_string(),
|
|
672
|
-
}),
|
|
673
|
-
..Default::default()
|
|
674
|
-
};
|
|
675
|
-
|
|
676
|
-
let pdf_path =
|
|
677
|
-
std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/google_doc_document.pdf");
|
|
678
|
-
if let Ok(content) = std::fs::read(pdf_path) {
|
|
679
|
-
let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
|
|
680
|
-
assert!(
|
|
681
|
-
result.is_ok(),
|
|
682
|
-
"Failed to extract PDF with page config: {:?}",
|
|
683
|
-
result.err()
|
|
684
|
-
);
|
|
685
|
-
|
|
686
|
-
let extraction_result = result.unwrap();
|
|
687
|
-
assert!(
|
|
688
|
-
extraction_result.pages.is_some(),
|
|
689
|
-
"Pages should be extracted when extract_pages is true"
|
|
690
|
-
);
|
|
691
|
-
}
|
|
692
|
-
}
|
|
693
|
-
|
|
694
|
-
#[tokio::test]
|
|
695
|
-
#[cfg(feature = "pdf")]
|
|
696
|
-
async fn test_pdf_batch_mode_validates_page_config_disabled() {
|
|
697
|
-
let extractor = PdfExtractor::new();
|
|
698
|
-
let config = ExtractionConfig::default();
|
|
699
|
-
|
|
700
|
-
let pdf_path =
|
|
701
|
-
std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/google_doc_document.pdf");
|
|
702
|
-
if let Ok(content) = std::fs::read(pdf_path) {
|
|
703
|
-
let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
|
|
704
|
-
assert!(
|
|
705
|
-
result.is_ok(),
|
|
706
|
-
"Failed to extract PDF without page config: {:?}",
|
|
707
|
-
result.err()
|
|
708
|
-
);
|
|
709
|
-
|
|
710
|
-
let extraction_result = result.unwrap();
|
|
711
|
-
assert!(
|
|
712
|
-
extraction_result.pages.is_none(),
|
|
713
|
-
"Pages should not be extracted when pages config is None"
|
|
714
|
-
);
|
|
715
|
-
}
|
|
716
|
-
}
|
|
717
|
-
|
|
718
|
-
#[tokio::test]
|
|
719
|
-
#[cfg(feature = "pdf")]
|
|
720
|
-
async fn test_pdf_page_marker_validation() {
|
|
721
|
-
use crate::core::config::PageConfig;
|
|
722
|
-
|
|
723
|
-
let extractor = PdfExtractor::new();
|
|
724
|
-
|
|
725
|
-
let config = ExtractionConfig {
|
|
726
|
-
pages: Some(PageConfig {
|
|
727
|
-
extract_pages: true,
|
|
728
|
-
insert_page_markers: true,
|
|
729
|
-
marker_format: "\n\n<!-- PAGE {page_num} -->\n\n".to_string(),
|
|
730
|
-
}),
|
|
731
|
-
..Default::default()
|
|
732
|
-
};
|
|
733
|
-
|
|
734
|
-
let pdf_path =
|
|
735
|
-
std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/multi_page.pdf");
|
|
736
|
-
if let Ok(content) = std::fs::read(pdf_path) {
|
|
737
|
-
let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
|
|
738
|
-
assert!(
|
|
739
|
-
result.is_ok(),
|
|
740
|
-
"Failed to extract PDF with page markers: {:?}",
|
|
741
|
-
result.err()
|
|
742
|
-
);
|
|
743
|
-
|
|
744
|
-
let extraction_result = result.unwrap();
|
|
745
|
-
let marker_placeholder = "<!-- PAGE ";
|
|
746
|
-
if extraction_result.content.len() > 100 {
|
|
747
|
-
assert!(
|
|
748
|
-
extraction_result.content.contains(marker_placeholder),
|
|
749
|
-
"Page markers should be inserted when configured and document has multiple pages"
|
|
750
|
-
);
|
|
751
|
-
}
|
|
752
|
-
}
|
|
753
|
-
}
|
|
754
|
-
|
|
755
|
-
#[test]
|
|
756
|
-
#[cfg(feature = "pdf")]
|
|
757
|
-
fn test_pdf_extractor_without_feature_pdf() {
|
|
758
|
-
let extractor = PdfExtractor::new();
|
|
759
|
-
assert_eq!(extractor.name(), "pdf-extractor");
|
|
760
|
-
}
|
|
761
496
|
}
|