kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -2,22 +2,17 @@
|
|
|
2
2
|
//!
|
|
3
3
|
//! This module converts pdfium character data to HocrWord format,
|
|
4
4
|
//! allowing us to reuse the existing table reconstruction logic.
|
|
5
|
-
//!
|
|
6
|
-
//! Note: Table extraction requires the "ocr" feature and is not available in WASM builds.
|
|
7
5
|
|
|
8
6
|
use super::error::{PdfError, Result};
|
|
9
|
-
#[cfg(feature = "ocr")]
|
|
10
7
|
use crate::ocr::table::HocrWord;
|
|
11
8
|
use pdfium_render::prelude::*;
|
|
12
9
|
|
|
13
10
|
/// Spacing threshold for word boundary detection (in PDF units).
|
|
14
11
|
///
|
|
15
12
|
/// Characters separated by more than this distance are considered separate words.
|
|
16
|
-
#[cfg(feature = "ocr")]
|
|
17
13
|
const WORD_SPACING_THRESHOLD: f32 = 3.0;
|
|
18
14
|
|
|
19
15
|
/// Minimum word length for table detection (filter out noise).
|
|
20
|
-
#[cfg(feature = "ocr")]
|
|
21
16
|
const MIN_WORD_LENGTH: usize = 1;
|
|
22
17
|
|
|
23
18
|
/// Extract words with positions from PDF page for table detection.
|
|
@@ -34,55 +29,37 @@ const MIN_WORD_LENGTH: usize = 1;
|
|
|
34
29
|
///
|
|
35
30
|
/// Vector of HocrWord objects with text and bounding box information.
|
|
36
31
|
///
|
|
37
|
-
/// # Note
|
|
38
|
-
/// This function requires the "ocr" feature to be enabled. Without it, returns an error.
|
|
39
|
-
///
|
|
40
32
|
/// # Example
|
|
41
33
|
///
|
|
42
34
|
/// ```rust,no_run
|
|
43
|
-
/// # #[cfg(feature = "ocr")]
|
|
44
|
-
/// # {
|
|
45
35
|
/// use kreuzberg::pdf::table::extract_words_from_page;
|
|
46
36
|
/// use pdfium_render::prelude::*;
|
|
47
37
|
///
|
|
48
|
-
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
49
38
|
/// let pdfium = Pdfium::default();
|
|
50
39
|
/// let document = pdfium.load_pdf_from_file("example.pdf", None)?;
|
|
51
40
|
/// let page = document.pages().get(0)?;
|
|
52
41
|
/// let words = extract_words_from_page(&page, 90.0)?;
|
|
53
|
-
/// # Ok(())
|
|
54
|
-
/// # }
|
|
55
|
-
/// # }
|
|
56
42
|
/// ```
|
|
57
|
-
#[cfg(feature = "ocr")]
|
|
58
43
|
pub fn extract_words_from_page(page: &PdfPage, min_confidence: f64) -> Result<Vec<HocrWord>> {
|
|
44
|
+
// Get page dimensions for coordinate system
|
|
59
45
|
let page_width = page.width().value as i32;
|
|
60
46
|
let page_height = page.height().value as i32;
|
|
61
47
|
|
|
48
|
+
// Get all text from page
|
|
62
49
|
let page_text = page
|
|
63
50
|
.text()
|
|
64
51
|
.map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get page text: {}", e)))?;
|
|
65
52
|
|
|
53
|
+
// Extract character-level information
|
|
66
54
|
let chars = page_text.chars();
|
|
67
55
|
|
|
56
|
+
// Group characters into words based on spacing
|
|
68
57
|
let words = group_chars_into_words(chars, page_width, page_height, min_confidence)?;
|
|
69
58
|
|
|
70
59
|
Ok(words)
|
|
71
60
|
}
|
|
72
61
|
|
|
73
|
-
/// Fallback implementation when OCR feature is disabled.
|
|
74
|
-
///
|
|
75
|
-
/// # Errors
|
|
76
|
-
/// Always returns an error indicating that the OCR feature is required.
|
|
77
|
-
#[cfg(not(feature = "ocr"))]
|
|
78
|
-
pub fn extract_words_from_page(_page: &PdfPage, _min_confidence: f64) -> Result<Vec<()>> {
|
|
79
|
-
Err(PdfError::TextExtractionFailed(
|
|
80
|
-
"PDF table extraction requires the 'ocr' feature to be enabled".to_string(),
|
|
81
|
-
))
|
|
82
|
-
}
|
|
83
|
-
|
|
84
62
|
/// Character with position information extracted from PDF.
|
|
85
|
-
#[cfg(feature = "ocr")]
|
|
86
63
|
#[derive(Debug, Clone)]
|
|
87
64
|
struct CharInfo {
|
|
88
65
|
text: char,
|
|
@@ -104,7 +81,6 @@ struct CharInfo {
|
|
|
104
81
|
/// * `page_width` - Page width in PDF units
|
|
105
82
|
/// * `page_height` - Page height in PDF units
|
|
106
83
|
/// * `min_confidence` - Minimum confidence threshold (PDF text uses 95.0)
|
|
107
|
-
#[cfg(feature = "ocr")]
|
|
108
84
|
fn group_chars_into_words(
|
|
109
85
|
chars: PdfPageTextChars,
|
|
110
86
|
_page_width: i32,
|
|
@@ -115,22 +91,26 @@ fn group_chars_into_words(
|
|
|
115
91
|
let mut current_word_chars: Vec<CharInfo> = Vec::new();
|
|
116
92
|
|
|
117
93
|
for pdf_char in chars.iter() {
|
|
94
|
+
// Get character bounds (use loose_bounds for table detection)
|
|
118
95
|
let bounds = pdf_char
|
|
119
96
|
.loose_bounds()
|
|
120
97
|
.map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get char bounds: {}", e)))?;
|
|
121
98
|
|
|
99
|
+
// Get unicode character (skip if invalid)
|
|
122
100
|
let Some(ch) = pdf_char.unicode_char() else {
|
|
123
101
|
continue;
|
|
124
102
|
};
|
|
125
103
|
|
|
104
|
+
// Extract character information
|
|
126
105
|
let char_info = CharInfo {
|
|
127
106
|
text: ch,
|
|
128
107
|
x: bounds.left().value,
|
|
129
|
-
y: bounds.bottom().value,
|
|
108
|
+
y: bounds.bottom().value, // PDF coordinates: bottom-left origin
|
|
130
109
|
width: bounds.width().value,
|
|
131
110
|
height: bounds.height().value,
|
|
132
111
|
};
|
|
133
112
|
|
|
113
|
+
// Skip whitespace characters (they're used for word boundaries)
|
|
134
114
|
if char_info.text.is_whitespace() {
|
|
135
115
|
if !current_word_chars.is_empty() {
|
|
136
116
|
if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
|
|
@@ -141,6 +121,7 @@ fn group_chars_into_words(
|
|
|
141
121
|
continue;
|
|
142
122
|
}
|
|
143
123
|
|
|
124
|
+
// Check if this character should start a new word
|
|
144
125
|
if should_start_new_word(&current_word_chars, &char_info) && !current_word_chars.is_empty() {
|
|
145
126
|
if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
|
|
146
127
|
words.push(word);
|
|
@@ -151,10 +132,11 @@ fn group_chars_into_words(
|
|
|
151
132
|
current_word_chars.push(char_info);
|
|
152
133
|
}
|
|
153
134
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
135
|
+
// Finalize last word
|
|
136
|
+
if !current_word_chars.is_empty() {
|
|
137
|
+
if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
|
|
138
|
+
words.push(word);
|
|
139
|
+
}
|
|
158
140
|
}
|
|
159
141
|
|
|
160
142
|
Ok(words)
|
|
@@ -164,7 +146,6 @@ fn group_chars_into_words(
|
|
|
164
146
|
///
|
|
165
147
|
/// Returns true if the character is far from the previous character
|
|
166
148
|
/// (indicating a word boundary) or on a different line.
|
|
167
|
-
#[cfg(feature = "ocr")]
|
|
168
149
|
fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -> bool {
|
|
169
150
|
if current_word_chars.is_empty() {
|
|
170
151
|
return false;
|
|
@@ -172,11 +153,13 @@ fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -
|
|
|
172
153
|
|
|
173
154
|
let last_char = &current_word_chars[current_word_chars.len() - 1];
|
|
174
155
|
|
|
156
|
+
// Check vertical distance (different lines)
|
|
175
157
|
let vertical_distance = (new_char.y - last_char.y).abs();
|
|
176
158
|
if vertical_distance > last_char.height * 0.5 {
|
|
177
159
|
return true;
|
|
178
160
|
}
|
|
179
161
|
|
|
162
|
+
// Check horizontal distance (word spacing)
|
|
180
163
|
let horizontal_gap = new_char.x - (last_char.x + last_char.width);
|
|
181
164
|
horizontal_gap > WORD_SPACING_THRESHOLD
|
|
182
165
|
}
|
|
@@ -185,43 +168,51 @@ fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -
|
|
|
185
168
|
///
|
|
186
169
|
/// Calculates bounding box and confidence for the word.
|
|
187
170
|
/// Returns None if the word doesn't meet minimum criteria.
|
|
188
|
-
#[cfg(feature = "ocr")]
|
|
189
171
|
fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> Option<HocrWord> {
|
|
190
172
|
if chars.is_empty() {
|
|
191
173
|
return None;
|
|
192
174
|
}
|
|
193
175
|
|
|
176
|
+
// Build word text
|
|
194
177
|
let text: String = chars.iter().map(|c| c.text).collect();
|
|
195
178
|
|
|
196
179
|
if text.len() < MIN_WORD_LENGTH {
|
|
197
180
|
return None;
|
|
198
181
|
}
|
|
199
182
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
(
|
|
214
|
-
|
|
215
|
-
(
|
|
216
|
-
|
|
183
|
+
// Calculate bounding box (encompassing all characters)
|
|
184
|
+
let left = chars
|
|
185
|
+
.iter()
|
|
186
|
+
.map(|c| c.x)
|
|
187
|
+
.min_by(|a, b| a.partial_cmp(b).unwrap())
|
|
188
|
+
.unwrap_or(0.0);
|
|
189
|
+
let right = chars
|
|
190
|
+
.iter()
|
|
191
|
+
.map(|c| c.x + c.width)
|
|
192
|
+
.max_by(|a, b| a.partial_cmp(b).unwrap())
|
|
193
|
+
.unwrap_or(0.0);
|
|
194
|
+
let bottom = chars
|
|
195
|
+
.iter()
|
|
196
|
+
.map(|c| c.y)
|
|
197
|
+
.min_by(|a, b| a.partial_cmp(b).unwrap())
|
|
198
|
+
.unwrap_or(0.0);
|
|
199
|
+
let top = chars
|
|
200
|
+
.iter()
|
|
201
|
+
.map(|c| c.y + c.height)
|
|
202
|
+
.max_by(|a, b| a.partial_cmp(b).unwrap())
|
|
203
|
+
.unwrap_or(0.0);
|
|
217
204
|
|
|
218
205
|
let width = (right - left).round() as i32;
|
|
219
206
|
let height = (top - bottom).round() as i32;
|
|
220
207
|
|
|
208
|
+
// Convert PDF coordinates (bottom-left origin) to image coordinates (top-left origin)
|
|
209
|
+
// HocrWord expects top-left origin like images/OCR output
|
|
221
210
|
let top_in_image_coords = (page_height as f32 - top).round() as i32;
|
|
222
211
|
|
|
212
|
+
// PDF text has high confidence (no OCR uncertainty)
|
|
223
213
|
let confidence = 95.0;
|
|
224
214
|
|
|
215
|
+
// Apply confidence threshold
|
|
225
216
|
if confidence < min_confidence {
|
|
226
217
|
return None;
|
|
227
218
|
}
|
|
@@ -236,7 +227,7 @@ fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> O
|
|
|
236
227
|
})
|
|
237
228
|
}
|
|
238
229
|
|
|
239
|
-
#[cfg(
|
|
230
|
+
#[cfg(test)]
|
|
240
231
|
mod tests {
|
|
241
232
|
use super::*;
|
|
242
233
|
|
|
@@ -279,18 +270,20 @@ mod tests {
|
|
|
279
270
|
height: 12.0,
|
|
280
271
|
}];
|
|
281
272
|
|
|
273
|
+
// Close character - same word
|
|
282
274
|
let close_char = CharInfo {
|
|
283
275
|
text: 'B',
|
|
284
|
-
x: 111.0,
|
|
276
|
+
x: 111.0, // 1 unit gap
|
|
285
277
|
y: 50.0,
|
|
286
278
|
width: 10.0,
|
|
287
279
|
height: 12.0,
|
|
288
280
|
};
|
|
289
281
|
assert!(!should_start_new_word(&chars, &close_char));
|
|
290
282
|
|
|
283
|
+
// Far character - new word
|
|
291
284
|
let far_char = CharInfo {
|
|
292
285
|
text: 'C',
|
|
293
|
-
x: 120.0,
|
|
286
|
+
x: 120.0, // 10 unit gap (> WORD_SPACING_THRESHOLD)
|
|
294
287
|
y: 50.0,
|
|
295
288
|
width: 10.0,
|
|
296
289
|
height: 12.0,
|
|
@@ -308,10 +301,11 @@ mod tests {
|
|
|
308
301
|
height: 12.0,
|
|
309
302
|
}];
|
|
310
303
|
|
|
304
|
+
// Character on different line
|
|
311
305
|
let new_line_char = CharInfo {
|
|
312
306
|
text: 'B',
|
|
313
307
|
x: 100.0,
|
|
314
|
-
y: 70.0,
|
|
308
|
+
y: 70.0, // Different y
|
|
315
309
|
width: 10.0,
|
|
316
310
|
height: 12.0,
|
|
317
311
|
};
|
|
@@ -342,7 +336,7 @@ mod tests {
|
|
|
342
336
|
|
|
343
337
|
assert_eq!(word.text, "Hi");
|
|
344
338
|
assert_eq!(word.left, 100);
|
|
345
|
-
assert_eq!(word.width, 18);
|
|
339
|
+
assert_eq!(word.width, 18); // 110 + 8 - 100
|
|
346
340
|
assert_eq!(word.height, 12);
|
|
347
341
|
assert_eq!(word.confidence, 95.0);
|
|
348
342
|
}
|
|
@@ -364,19 +358,22 @@ mod tests {
|
|
|
364
358
|
height: 12.0,
|
|
365
359
|
}];
|
|
366
360
|
|
|
361
|
+
// Low threshold - should pass
|
|
367
362
|
let word = finalize_word(&chars, 800, 90.0);
|
|
368
363
|
assert!(word.is_some());
|
|
369
364
|
|
|
365
|
+
// High threshold - should fail
|
|
370
366
|
let word = finalize_word(&chars, 800, 96.0);
|
|
371
367
|
assert!(word.is_none());
|
|
372
368
|
}
|
|
373
369
|
|
|
374
370
|
#[test]
|
|
375
371
|
fn test_coordinate_conversion() {
|
|
372
|
+
// Test PDF coordinate (bottom-left origin) to image coordinate (top-left origin)
|
|
376
373
|
let chars = vec![CharInfo {
|
|
377
374
|
text: 'A',
|
|
378
375
|
x: 100.0,
|
|
379
|
-
y: 700.0,
|
|
376
|
+
y: 700.0, // PDF coordinates: bottom-left origin
|
|
380
377
|
width: 10.0,
|
|
381
378
|
height: 12.0,
|
|
382
379
|
}];
|
|
@@ -384,11 +381,13 @@ mod tests {
|
|
|
384
381
|
let page_height = 800;
|
|
385
382
|
let word = finalize_word(&chars, page_height, 0.0).unwrap();
|
|
386
383
|
|
|
384
|
+
// top_in_image_coords = page_height - (y + height) = 800 - (700 + 12) = 88
|
|
387
385
|
assert_eq!(word.top, 88);
|
|
388
386
|
}
|
|
389
387
|
|
|
390
388
|
#[test]
|
|
391
389
|
fn test_word_bounding_box() {
|
|
390
|
+
// Test that bounding box encompasses all characters
|
|
392
391
|
let chars = vec![
|
|
393
392
|
CharInfo {
|
|
394
393
|
text: 'A',
|
|
@@ -400,18 +399,22 @@ mod tests {
|
|
|
400
399
|
CharInfo {
|
|
401
400
|
text: 'B',
|
|
402
401
|
x: 110.0,
|
|
403
|
-
y: 51.0,
|
|
402
|
+
y: 51.0, // Slightly different y
|
|
404
403
|
width: 10.0,
|
|
405
|
-
height: 13.0,
|
|
404
|
+
height: 13.0, // Slightly different height
|
|
406
405
|
},
|
|
407
406
|
];
|
|
408
407
|
|
|
409
408
|
let word = finalize_word(&chars, 800, 0.0).unwrap();
|
|
410
409
|
|
|
410
|
+
// Left should be minimum x
|
|
411
411
|
assert_eq!(word.left, 100);
|
|
412
412
|
|
|
413
|
-
|
|
413
|
+
// Width should span from leftmost to rightmost character
|
|
414
|
+
assert_eq!(word.width, 20); // 120 - 100
|
|
414
415
|
|
|
416
|
+
// Height should encompass both characters
|
|
417
|
+
// max(y+height) - min(y) = max(51+13, 50+12) - 50 = 64 - 50 = 14
|
|
415
418
|
assert_eq!(word.height, 14);
|
|
416
419
|
}
|
|
417
420
|
}
|